aarch64: Use VCodeConstant for f64/v128 constants (#5997)

* aarch64: Translate float and splat lowering to ISLE

  I was looking into `constant_f128` and its fallback lowering into memory, and to get familiar with the code I figured it'd be good to port some Rust logic to ISLE. This commit ports the `constant_{f128,f64,f32}` helpers into ISLE from Rust, as well as the `splat_const` helper, which ended up being closely related.

  Tests reflect a number of regalloc changes, but the one notable difference is that the lowering of `f32` now creates a 32-bit immediate instead of a 64-bit immediate (in a GP register before it's moved into an FP register). This is not a semantic change, but the generated code is slightly different in a few minor cases.

* aarch64: Load f64/v128 constants from a pool

  This commit removes the `LoadFpuConst64` and `LoadFpuConst128` pseudo-instructions from the AArch64 backend, which internally loaded a nearby constant and then jumped over it. Constants now go through the `VCodeConstant` infrastructure, which places them at the end of the function, similar to how x64 works. Some minor support was also added for a new addressing mode: a `MachLabel`-relative load.
bytecodealliance · Mar 13, 2023 · 03b5dbb · 03b5dbb
1 parent 6ecdc24
commit 03b5dbb
Show file tree

Hide file tree

Showing 25 changed files with 615 additions and 737 deletions.
diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -466,14 +466,6 @@
         (mem PairAMode)
         (flags MemFlags))
 
-       (LoadFpuConst64
-        (rd WritableReg)
-        (const_data u64))
-
-       (LoadFpuConst128
-        (rd WritableReg)
-        (const_data u128))
-
        ;; Conversion: FP -> integer.
        (FpuToInt
         (op FpuToIntOp)
@@ -1135,6 +1127,11 @@
          (off i64)
          (ty Type))
 
+        ;; A reference to a constant which is placed outside of the function's
+        ;; body, typically at the end.
+        (Const
+          (addr VCodeConstant))
+
         ;; Offset from the "nominal stack pointer", which is where the real SP is
         ;; just after stack and spill slots are allocated in the function prologue.
         ;; At emission time, this is converted to `SPOffset` with a fixup added to
@@ -1194,6 +1191,16 @@
 (rule (lane_size (dynamic_lane 32 _)) (ScalarSize.Size32))
 (rule (lane_size (dynamic_lane 64 _)) (ScalarSize.Size64))
 
+;; Helper for extracting the size of a lane from the input `VectorSize`
+(decl pure vector_lane_size (VectorSize) ScalarSize)
+(rule (vector_lane_size (VectorSize.Size8x16)) (ScalarSize.Size8))
+(rule (vector_lane_size (VectorSize.Size8x8))  (ScalarSize.Size8))
+(rule (vector_lane_size (VectorSize.Size16x8)) (ScalarSize.Size16))
+(rule (vector_lane_size (VectorSize.Size16x4)) (ScalarSize.Size16))
+(rule (vector_lane_size (VectorSize.Size32x4)) (ScalarSize.Size32))
+(rule (vector_lane_size (VectorSize.Size32x2)) (ScalarSize.Size32))
+(rule (vector_lane_size (VectorSize.Size64x2)) (ScalarSize.Size64))
+
 (type Cond extern
   (enum
     (Eq)
@@ -1908,6 +1915,13 @@
             (_ Unit (emit (MInst.VecDupFromFpu dst src size lane))))
         dst))
 
+;; Helper for emitting `MInst.VecDupImm` instructions.
+(decl vec_dup_imm (ASIMDMovModImm bool VectorSize) Reg)
+(rule (vec_dup_imm imm invert size)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.VecDupImm dst imm invert size))))
+        dst))
+
 ;; Helper for emitting `MInst.AluRRImm12` instructions.
 (decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg)
 (rule (alu_rr_imm12 op ty src imm)
@@ -2158,6 +2172,13 @@
             (_ Unit (emit (MInst.MovToFpu dst x size))))
         dst))
 
+;; Helper for emitting `MInst.FpuMoveFPImm` instructions.
+(decl fpu_move_fp_imm (ASIMDFPModImm ScalarSize) Reg)
+(rule (fpu_move_fp_imm imm size)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.FpuMoveFPImm dst imm size))))
+        dst))
+
 ;; Helper for emitting `MInst.MovToVec` instructions.
 (decl mov_to_vec (Reg Reg u8 VectorSize) Reg)
 (rule (mov_to_vec src1 src2 lane size)
@@ -2986,24 +3007,122 @@
            (amode ty addr offset)))
 
 ;; Lower a constant f32.
-(decl constant_f32 (u64) Reg)
-;; TODO: Port lower_constant_f32() to ISLE.
-(extern constructor constant_f32 constant_f32)
+;;
+;; Note that we must make sure that all bits outside the lowest 32 are set to 0
+;; because this function is also used to load wider constants (that have zeros
+;; in their most significant bits).
+(decl constant_f32 (u32) Reg)
+(rule 2 (constant_f32 0)
+        (vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
+                     $false
+                     (VectorSize.Size32x2)))
+(rule 1 (constant_f32 n)
+        (if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size32)))
+        (fpu_move_fp_imm imm (ScalarSize.Size32)))
+(rule (constant_f32 n)
+      (mov_to_fpu (imm $I32 (ImmExtend.Zero) n) (ScalarSize.Size32)))
 
 ;; Lower a constant f64.
+;;
+;; Note that we must make sure that all bits outside the lowest 64 are set to 0
+;; because this function is also used to load wider constants (that have zeros
+;; in their most significant bits).
+;; TODO: Treat as half of a 128 bit vector and consider replicated patterns.
+;; Scalar MOVI might also be an option.
 (decl constant_f64 (u64) Reg)
-;; TODO: Port lower_constant_f64() to ISLE.
-(extern constructor constant_f64 constant_f64)
+(rule 4 (constant_f64 0)
+        (vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
+                     $false
+                     (VectorSize.Size32x2)))
+(rule 3 (constant_f64 n)
+        (if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size64)))
+        (fpu_move_fp_imm imm (ScalarSize.Size64)))
+(rule 2 (constant_f64 (u64_as_u32 n))
+        (constant_f32 n))
+(rule 1 (constant_f64 (u64_low32_bits_unset n))
+        (mov_to_fpu (imm $I64 (ImmExtend.Zero) n) (ScalarSize.Size64)))
+(rule (constant_f64 n)
+      (fpu_load64 (AMode.Const (emit_u64_le_const n)) (mem_flags_trusted)))
+
+;; Tests whether the low 32 bits in the input are all zero.
+(decl u64_low32_bits_unset (u64) u64)
+(extern extractor u64_low32_bits_unset u64_low32_bits_unset)
 
 ;; Lower a constant f128.
 (decl constant_f128 (u128) Reg)
-;; TODO: Port lower_constant_f128() to ISLE.
-(extern constructor constant_f128 constant_f128)
+(rule 3 (constant_f128 0)
+        (vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size8))
+                     $false
+                     (VectorSize.Size8x16)))
+
+;; If the upper 64-bits are all zero then defer to `constant_f64`.
+(rule 2 (constant_f128 (u128_as_u64 n)) (constant_f64 n))
+
+;; If the low half of the u128 equals the high half then delegate to the splat
+;; logic as a splat of a 64-bit value.
+(rule 1 (constant_f128 (u128_replicated_u64 n))
+        (splat_const n (VectorSize.Size64x2)))
+
+;; Base case is to load the constant from memory.
+(rule (constant_f128 n)
+      (fpu_load128 (AMode.Const (emit_u128_le_const n)) (mem_flags_trusted)))
 
 ;; Lower a vector splat with a constant parameter.
+;;
+;; Only the low bits of the 64-bit input, as many as the lane size in
+;; `VectorSize`, are used; all other bits are ignored.
 (decl splat_const (u64 VectorSize) Reg)
-;; TODO: Port lower_splat_const() to ISLE.
-(extern constructor splat_const splat_const)
+
+;; If the splat'd constant can itself be reduced in size then attempt to do so
+;; as it will make it easier to create the immediates in the instructions below.
+(rule 5 (splat_const (u64_replicated_u32 n) (VectorSize.Size64x2))
+        (splat_const n (VectorSize.Size32x4)))
+(rule 5 (splat_const (u32_replicated_u16 n) (VectorSize.Size32x4))
+        (splat_const n (VectorSize.Size16x8)))
+(rule 5 (splat_const (u32_replicated_u16 n) (VectorSize.Size32x2))
+        (splat_const n (VectorSize.Size16x4)))
+(rule 5 (splat_const (u16_replicated_u8 n) (VectorSize.Size16x8))
+        (splat_const n (VectorSize.Size8x16)))
+(rule 5 (splat_const (u16_replicated_u8 n) (VectorSize.Size16x4))
+        (splat_const n (VectorSize.Size8x8)))
+
+;; Special cases for `vec_dup_imm` instructions where the immediate encodes
+;; either the input itself or its bitwise inverse (`u64_not`).
+(rule 4 (splat_const n size)
+        (if-let imm (asimd_mov_mod_imm_from_u64 n (vector_lane_size size)))
+        (vec_dup_imm imm $false size))
+(rule 3 (splat_const n size)
+        (if-let imm (asimd_mov_mod_imm_from_u64 (u64_not n) (vector_lane_size size)))
+        (vec_dup_imm imm $true size))
+
+;; Special case a 32-bit splat where an immediate can be created by
+;; concatenating the 32-bit constant with itself to form a 64-bit value.
+(rule 2 (splat_const n (VectorSize.Size32x4))
+        (if-let imm (asimd_mov_mod_imm_from_u64 (u64_or n (u64_shl n 32)) (ScalarSize.Size64)))
+        (vec_dup_imm imm $false (VectorSize.Size64x2)))
+(rule 2 (splat_const n (VectorSize.Size32x2))
+        (if-let imm (asimd_mov_mod_imm_from_u64 (u64_or n (u64_shl n 32)) (ScalarSize.Size64)))
+        (fpu_extend (vec_dup_imm imm $false (VectorSize.Size64x2)) (ScalarSize.Size64)))
+
+(rule 1 (splat_const n size)
+        (if-let imm (asimd_fp_mod_imm_from_u64 n (vector_lane_size size)))
+        (vec_dup_fp_imm imm size))
+
+;; The base case for splat is to use `vec_dup` with the immediate loaded into a
+;; register.
+(rule (splat_const n size)
+      (vec_dup (imm $I64 (ImmExtend.Zero) n) size))
+
+;; Each of these extractors tests whether the upper half of the input equals the
+;; lower half of the input
+(decl u128_replicated_u64 (u64) u128)
+(extern extractor u128_replicated_u64 u128_replicated_u64)
+(decl u64_replicated_u32 (u64) u64)
+(extern extractor u64_replicated_u32 u64_replicated_u32)
+(decl u32_replicated_u16 (u64) u64)
+(extern extractor u32_replicated_u16 u32_replicated_u16)
+(decl u16_replicated_u8 (u64) u64)
+(extern extractor u16_replicated_u8 u16_replicated_u8)
 
 ;; Lower a FloatCC to a Cond.
 (decl fp_cond_code (FloatCC) Cond)
@@ -3814,3 +3933,36 @@
 ;; Helper for emitting the `trn2` instruction
 (decl vec_trn2 (Reg Reg VectorSize) Reg)
 (rule (vec_trn2 rn rm size) (vec_rrr (VecALUOp.Trn2) rn rm size))
+
+;; Helper for creating a zero value `ASIMDMovModImm` immediate.
+(decl asimd_mov_mod_imm_zero (ScalarSize) ASIMDMovModImm)
+(extern constructor asimd_mov_mod_imm_zero asimd_mov_mod_imm_zero)
+
+;; Helper for fallibly creating an `ASIMDMovModImm` immediate from its parts.
+(decl pure partial asimd_mov_mod_imm_from_u64 (u64 ScalarSize) ASIMDMovModImm)
+(extern constructor asimd_mov_mod_imm_from_u64 asimd_mov_mod_imm_from_u64)
+
+;; Helper for fallibly creating an `ASIMDFPModImm` immediate from its parts.
+(decl pure partial asimd_fp_mod_imm_from_u64 (u64 ScalarSize) ASIMDFPModImm)
+(extern constructor asimd_fp_mod_imm_from_u64 asimd_fp_mod_imm_from_u64)
+
+;; Helper for creating a `VecDupFPImm` instruction
+(decl vec_dup_fp_imm (ASIMDFPModImm VectorSize) Reg)
+(rule (vec_dup_fp_imm imm size)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.VecDupFPImm dst imm size))))
+       dst))
+
+;; Helper for creating a `FpuLoad64` instruction
+(decl fpu_load64 (AMode MemFlags) Reg)
+(rule (fpu_load64 amode flags)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.FpuLoad64 dst amode flags))))
+       dst))
+
+;; Helper for creating a `FpuLoad128` instruction
+(decl fpu_load128 (AMode MemFlags) Reg)
+(rule (fpu_load128 amode flags)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.FpuLoad128 dst amode flags))))
+       dst))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -124,6 +124,9 @@ pub enum MemLabel {
     /// offset from this instruction. This form must be used at emission time;
     /// see `memlabel_finalize()` for how other forms are lowered to this one.
     PCRel(i32),
+    /// An address that refers to a label within a `MachBuffer`, for example a
+    /// constant that lives in the pool at the end of the function.
+    Mach(MachLabel),
 }
 
 impl AMode {
@@ -194,6 +197,7 @@ impl AMode {
             | &AMode::FPOffset { .. }
             | &AMode::SPOffset { .. }
             | &AMode::NominalSPOffset { .. }
+            | &AMode::Const { .. }
             | AMode::Label { .. } => self.clone(),
         }
     }
@@ -382,7 +386,8 @@ impl PrettyPrint for ExtendOp {
 impl PrettyPrint for MemLabel {
     fn pretty_print(&self, _: u8, _: &mut AllocationConsumer<'_>) -> String {
         match self {
-            &MemLabel::PCRel(off) => format!("pc+{}", off),
+            MemLabel::PCRel(off) => format!("pc+{}", off),
+            MemLabel::Mach(off) => format!("label({})", off.get()),
         }
     }
 }
@@ -465,6 +470,8 @@ impl PrettyPrint for AMode {
                 let simm9 = simm9.pretty_print(8, allocs);
                 format!("[sp], {}", simm9)
             }
+            AMode::Const { addr } => format!("[const({})]", addr.as_u32()),
+
             // Eliminated by `mem_finalize()`.
             &AMode::SPOffset { .. }
             | &AMode::FPOffset { .. }