From 086954b516628752b492507ce9484c47dcfa78b6 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Fri, 4 Nov 2022 12:53:32 +0100
Subject: [PATCH] Support big- and little-endian lane order with bitcast

Add a MemFlags operand to the bitcast instruction, where only the
`big` and `little` flags are accepted.  These define the lane order
to be used when casting between types of different lane counts.

Update all users to pass an appropriate MemFlags argument.

Implement lane swaps where necessary in the s390x back-end.

This is the final part necessary to fix
https://github.com/bytecodealliance/wasmtime/issues/4566.
---
 .../codegen/meta/src/shared/instructions.rs   | 11 ++-
 cranelift/codegen/src/isa/aarch64/lower.isle  | 10 +--
 cranelift/codegen/src/isa/riscv64/lower.isle  |  2 +-
 cranelift/codegen/src/isa/s390x/inst.isle     |  5 ++
 cranelift/codegen/src/isa/s390x/lower.isle    | 40 ++++++----
 cranelift/codegen/src/isa/x64/lower.isle      | 12 +--
 cranelift/codegen/src/verifier/mod.rs         | 16 +++-
 .../filetests/isa/s390x/bitcast.clif          | 79 +++++++++++++++++++
 .../filetests/isa/s390x/floating-point.clif   | 18 +++++
 .../filetests/isa/s390x/vec-bitcast.clif      | 76 ++++++++++++++++++
 .../filetests/filetests/verifier/bitcast.clif | 19 +++++
 cranelift/wasm/src/code_translator.rs         | 24 ++++--
 12 files changed, 269 insertions(+), 43 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/s390x/bitcast.clif
 create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif

diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index cc0582e85441..cceffa495c6f 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3104,6 +3104,7 @@ pub(crate) fn define(
 
     let x = &Operand::new("x", Mem);
     let a = &Operand::new("a", MemTo).with_doc("Bits of `x` reinterpreted");
+    let MemFlags = &Operand::new("MemFlags", &imm.memflags);
 
     ig.push(
         Inst::new(
@@ -3113,11 +3114,15 @@ pub(crate) fn define(
 
         The input and output types must be storable to memory and of the same
         size. A bitcast is equivalent to storing one type and loading the other
-        type from the same address.
+        type from the same address, using the specified MemFlags.
+
+        Note that this operation only supports the `big` or `little` MemFlags.
+        The specified byte order only affects the result in the case where
+        input and output types differ in lane count/size.
         "#,
-            &formats.unary,
+            &formats.load_no_offset,
         )
-        .operands_in(vec![x])
+        .operands_in(vec![MemFlags, x])
         .operands_out(vec![a]),
     );
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index b441b1027234..a9cc3846a90c 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -2197,25 +2197,25 @@
 ;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ; SIMD&FP <=> SIMD&FP
-(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type (ty_float_or_vec _)))))
+(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type (ty_float_or_vec _)))))
       x)
 
 ; GPR => SIMD&FP
-(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type in_ty))))
+(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 in_ty))
       (mov_to_fpu x (scalar_size in_ty)))
 
 ; SIMD&FP => GPR
-(rule 3 (lower (has_type out_ty (bitcast x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
+(rule 3 (lower (has_type out_ty (bitcast _ x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
       (if (ty_int_ref_scalar_64 out_ty))
       (mov_from_vec x 0 (scalar_size out_ty)))
 
 ; GPR <=> GPR
-(rule 2 (lower (has_type out_ty (bitcast x @ (value_type in_ty))))
+(rule 2 (lower (has_type out_ty (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 out_ty))
       (if (ty_int_ref_scalar_64 in_ty))
       x)
-(rule 1 (lower (has_type $I128 (bitcast x @ (value_type $I128)))) x)
+(rule 1 (lower (has_type $I128 (bitcast _ x @ (value_type $I128)))) x)
 
 ;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 750e2a5cf16a..c30087525fb6 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -811,7 +811,7 @@
 )
 ;;;;;  Rules for `bitcast`;;;;;;;;;
 (rule
-   (lower (has_type out (bitcast v @ (value_type in_ty))))
+   (lower (has_type out (bitcast _ v @ (value_type in_ty))))
    (gen_moves v in_ty out))
 
 ;;;;;  Rules for `ceil`;;;;;;;;;
diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle
index 5d519abbaa0f..5261138ba28d 100644
--- a/cranelift/codegen/src/isa/s390x/inst.isle
+++ b/cranelift/codegen/src/isa/s390x/inst.isle
@@ -1486,6 +1486,11 @@
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.LittleEndian)) $false)
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.BigEndian)) $true)
 
+;; Return lane order matching memory byte order.
+(decl pure lane_order_from_memflags (MemFlags) LaneOrder)
+(rule 0 (lane_order_from_memflags (littleendian)) (LaneOrder.LittleEndian))
+(rule 1 (lane_order_from_memflags (bigendian)) (LaneOrder.BigEndian))
+
 ;; Convert a CLIF immediate lane index value to big-endian lane order.
 (decl be_lane_idx (Type u8) u8)
 (extern constructor be_lane_idx be_lane_idx)
diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle
index 6c2629aab338..0e4ec27decf5 100644
--- a/cranelift/codegen/src/isa/s390x/lower.isle
+++ b/cranelift/codegen/src/isa/s390x/lower.isle
@@ -1738,40 +1738,46 @@
 ;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Reinterpret a 64-bit integer value as floating-point.
-(rule (lower (has_type $F64 (bitcast x @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ x @ (value_type $I64))))
       (vec_insert_lane_undef $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 64-bit floating-point value as integer.
-(rule (lower (has_type $I64 (bitcast x @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ x @ (value_type $F64))))
       (vec_extract_lane $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit integer value as floating-point.
-(rule (lower (has_type $F32 (bitcast x @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ x @ (value_type $I32))))
       (vec_insert_lane_undef $F32X4 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit floating-point value as integer.
-(rule (lower (has_type $I32 (bitcast x @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ x @ (value_type $F32))))
       (vec_extract_lane $F32X4 x 0 (zero_reg)))
 
 ;; Bitcast between types residing in GPRs is a no-op.
 (rule 1 (lower (has_type (gpr32_ty _)
-                         (bitcast x @ (value_type (gpr32_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr32_ty _))))) x)
 (rule 2 (lower (has_type (gpr64_ty _)
-                         (bitcast x @ (value_type (gpr64_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr64_ty _))))) x)
 
 ;; Bitcast between types residing in FPRs is a no-op.
 (rule 3 (lower (has_type (ty_scalar_float _)
-                         (bitcast x @ (value_type (ty_scalar_float _))))) x)
-
-;; Bitcast between types residing in VRs is a no-op.
-;; FIXME: There are two flavors of vector bitcast, which are currently not
-;; distinguished in CLIF IR.  Those generated by Wasmtime assume little-endian
-;; lane order, and those generated elsewhere assume big-endian lane order.
-;; Bitcast is a no-op if current lane order matches that assumed lane order.
-;; However, due to our choice of lane order depending on the current function
-;; ABI, every bitcast we currently see here is indeed a no-op.
-(rule 4 (lower (has_type (vr128_ty _)
-                         (bitcast x @ (value_type (vr128_ty _))))) x)
+                         (bitcast _ x @ (value_type (ty_scalar_float _))))) x)
+
+;; Bitcast between types residing in VRs is a no-op if lane count is unchanged.
+(rule 5 (lower (has_type (multi_lane bits count)
+                         (bitcast _ x @ (value_type (multi_lane bits count))))) x)
+
+;; Bitcast between types residing in VRs with different lane counts is a
+;; no-op if the operation's MemFlags indicate a byte order compatible with
+;; the current lane order.  Otherwise, lane elements need to be swapped,
+;; first in the input type, and then again in the output type.  This could
+;; be optimized further, but we don't bother at the moment since due to our
+;; choice of lane order depending on the current function ABI, this case will
+;; currently never arise in practice.
+(rule 4 (lower (has_type (vr128_ty out_ty)
+                         (bitcast flags x @ (value_type (vr128_ty in_ty)))))
+      (abi_vec_elt_rev (lane_order_from_memflags flags) out_ty
+        (abi_vec_elt_rev (lane_order_from_memflags flags) in_ty x)))
 
 
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index f08d6ad5a7b1..326a83258b0d 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3298,25 +3298,25 @@
 
 ;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ src @ (value_type $F32))))
       (bitcast_xmm_to_gpr $F32 src))
 
-(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ src @ (value_type $I32))))
       (bitcast_gpr_to_xmm $I32 src))
 
-(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ src @ (value_type $F64))))
       (bitcast_xmm_to_gpr $F64 src))
 
-(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ src @ (value_type $I64))))
       (bitcast_gpr_to_xmm $I64 src))
 
 ;; Bitcast between types residing in GPR registers is a no-op.
 (rule 1 (lower (has_type (is_gpr_type _)
-                         (bitcast x @ (value_type (is_gpr_type _))))) x)
+                         (bitcast _ x @ (value_type (is_gpr_type _))))) x)
 
 ;; Bitcast between types residing in XMM registers is a no-op.
 (rule 2 (lower (has_type (is_xmm_type _)
-                         (bitcast x @ (value_type (is_xmm_type _))))) x)
+                         (bitcast _ x @ (value_type (is_xmm_type _))))) x)
 
 ;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/codegen/src/verifier/mod.rs b/cranelift/codegen/src/verifier/mod.rs
index 80c79eb45642..0f858bf531f3 100644
--- a/cranelift/codegen/src/verifier/mod.rs
+++ b/cranelift/codegen/src/verifier/mod.rs
@@ -67,7 +67,7 @@ use crate::ir::entities::AnyEntity;
 use crate::ir::instructions::{BranchInfo, CallInfo, InstructionFormat, ResolvedConstraint};
 use crate::ir::{
     types, ArgumentPurpose, Block, Constant, DynamicStackSlot, FuncRef, Function, GlobalValue,
-    Inst, JumpTable, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
+    Inst, JumpTable, MemFlags, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
 };
 use crate::isa::TargetIsa;
 use crate::iterators::IteratorExtras;
@@ -729,11 +729,12 @@ impl<'a> Verifier<'a> {
                     ));
                 }
             }
-            Unary {
+            LoadNoOffset {
                 opcode: Opcode::Bitcast,
+                flags,
                 arg,
             } => {
-                self.verify_bitcast(inst, arg, errors)?;
+                self.verify_bitcast(inst, flags, arg, errors)?;
             }
             UnaryConst {
                 opcode: Opcode::Vconst,
@@ -1070,6 +1071,7 @@ impl<'a> Verifier<'a> {
     fn verify_bitcast(
         &self,
         inst: Inst,
+        flags: MemFlags,
         arg: Value,
         errors: &mut VerifierErrors,
     ) -> VerifierStepResult<()> {
@@ -1086,6 +1088,14 @@ impl<'a> Verifier<'a> {
                     typ.bits()
                 ),
             ))
+        } else if flags != MemFlags::new()
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Little)
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Big)
+        {
+            errors.fatal((
+                inst,
+                "The bitcast instruction only accepts the `big` or `little` memory flags",
+            ))
         } else {
             Ok(())
         }
diff --git a/cranelift/filetests/filetests/isa/s390x/bitcast.clif b/cranelift/filetests/filetests/isa/s390x/bitcast.clif
new file mode 100644
index 000000000000..6d946badbacb
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/s390x/bitcast.clif
@@ -0,0 +1,79 @@
+test compile precise-output
+target s390x
+
+;; Bitcast between integral types is a no-op.
+
+function %bitcast_i8_i8(i8) -> i8 {
+block0(v0: i8):
+  v1 = bitcast.i8 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i16_i16(i16) -> i16 {
+block0(v0: i16):
+  v1 = bitcast.i16 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i32_i32(i32) -> i32 {
+block0(v0: i32):
+  v1 = bitcast.i32 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64_i64(i64) -> i64 {
+block0(v0: i64):
+  v1 = bitcast.i64 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i128_i128(i128) -> i128 {
+block0(v0: i128):
+  v1 = bitcast.i128 v0
+  return v1
+}
+
+; block0:
+;   vl %v0, 0(%r3)
+;   vst %v0, 0(%r2)
+;   br %r14
+
+function %bitcast_r64_i64(r64) -> i64 {
+block0(v0: r64):
+  v1 = bitcast.i64 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64_r64(i64) -> r64 {
+block0(v0: i64):
+  v1 = bitcast.r64 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_r64_r64(r64) -> r64 {
+block0(v0: r64):
+  v1 = bitcast.r64 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif b/cranelift/filetests/filetests/isa/s390x/floating-point.clif
index 1bfbf090bc8c..fe4a6cf41ff6 100644
--- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif
@@ -1200,3 +1200,21 @@ block0(v0: f32):
 ;   vlgvf %r2, %v0, 0
 ;   br %r14
 
+function %bitcast_f32_f32(f32) -> f32 {
+block0(v0: f32):
+  v1 = bitcast.f32 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_f64_f64(f64) -> f64 {
+block0(v0: f64):
+  v1 = bitcast.f64 v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
new file mode 100644
index 000000000000..c0d8ae45634e
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
@@ -0,0 +1,76 @@
+test compile precise-output
+target s390x
+
+;; Vector bitcast is a no-op if the lane count remains unchanged,
+;; or if the ABI lane-order matches the specified byte order.
+;; Otherwise, lane-swaps need to happen.
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 big v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 little v0
+  return v1
+}
+
+; block0:
+;   vpdi %v3, %v24, %v24, 4
+;   vpdi %v5, %v3, %v3, 4
+;   verllg %v24, %v5, 32
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 big v0
+  return v1
+}
+
+; block0:
+;   vpdi %v3, %v24, %v24, 4
+;   vpdi %v5, %v3, %v3, 4
+;   verllg %v24, %v5, 32
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 little v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 big v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 little v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 wasmtime_system_v {
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 big v0
+  return v1
+}
+
+; block0:
+;   br %r14
+
diff --git a/cranelift/filetests/filetests/verifier/bitcast.clif b/cranelift/filetests/filetests/verifier/bitcast.clif
index 5ed7b8386d7f..78acd0e905e4 100644
--- a/cranelift/filetests/filetests/verifier/bitcast.clif
+++ b/cranelift/filetests/filetests/verifier/bitcast.clif
@@ -21,3 +21,22 @@ block0(v0: i64):
     return v1
 }
 
+; "little"/"big" flag modifier is ok
+function %bitcast_little(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 little v0
+    return v1
+}
+function %bitcast_big(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 big v0
+    return v1
+}
+
+; other flag modifiers are not ok
+function %bitcast_big(i32) -> f32 {
+block0(v0: i32):
+    v1 = bitcast.f32 big notrap v0 ;  error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7122ba5f91e6..86d54ffc0328 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -868,19 +868,19 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::F32ReinterpretI32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(F32, val));
+            state.push1(builder.ins().bitcast(F32, MemFlags::new(), val));
         }
         Operator::F64ReinterpretI64 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(F64, val));
+            state.push1(builder.ins().bitcast(F64, MemFlags::new(), val));
         }
         Operator::I32ReinterpretF32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I32, val));
+            state.push1(builder.ins().bitcast(I32, MemFlags::new(), val));
         }
         Operator::I64ReinterpretF64 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I64, val));
+            state.push1(builder.ins().bitcast(I64, MemFlags::new(), val));
         }
         Operator::I32Extend8S => {
             let val = state.pop1();
@@ -2898,7 +2898,9 @@ fn optionally_bitcast_vector(
     builder: &mut FunctionBuilder,
 ) -> Value {
     if builder.func.dfg.value_type(value) != needed_type {
-        builder.ins().bitcast(needed_type, value)
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        builder.ins().bitcast(needed_type, flags, value)
     } else {
         value
     }
@@ -2933,7 +2935,9 @@ fn canonicalise_v128_values<'a>(
     // Otherwise we'll have to cast, and push the resulting `Value`s into `canonicalised`.
     for v in values {
         tmp_canonicalised.push(if is_non_canonical_v128(builder.func.dfg.value_type(*v)) {
-            builder.ins().bitcast(I8X16, *v)
+            let mut flags = MemFlags::new();
+            flags.set_endianness(ir::Endianness::Little);
+            builder.ins().bitcast(I8X16, flags, *v)
         } else {
             *v
         });
@@ -3056,7 +3060,9 @@ pub fn bitcast_wasm_returns<FE: FuncEnvironment + ?Sized>(
         environ.is_wasm_return(&builder.func.signature, i)
     });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }
 
@@ -3072,6 +3078,8 @@ fn bitcast_wasm_params<FE: FuncEnvironment + ?Sized>(
         environ.is_wasm_parameter(&callee_signature, i)
     });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }