x64: Add lea-based lowering for iadd
This commit adds a rule for the lowering of `iadd` to use `lea` for 32
and 64-bit addition. The theoretical benefit of `lea` over the `add`
instruction is that the `lea` variant can emulate a 3-operand
instruction which doesn't destructively modify one of its operands.
Additionally the `lea` operation can fold in other components such as
constant additions and shifts.

In practice, however, if `lea` is unconditionally used instead of `add`
it ends up losing 10% performance on a local `meshoptimizer` benchmark.
My best guess as to what's going on here is that my CPU's dedicated
units for address computation are all overloaded while the ALUs are
basically idle in a memory-intensive loop. Previously, when the ALU was
used for `add` and the address units for stores/loads, things in theory
pipelined better (most of this is me shooting in the dark). To prevent
the performance loss here I've updated the lowering of `iadd` to use
either `lea` or `add` depending on how "complicated" the `Amode` is.
Simple ones like `a + b` or `a + $imm` continue to use `add` (accepting
the hypothetical extra `mov` needed to move the result into place).
More complicated ones like `a + b + $imm` or `a + (b << c) + $imm` use
`lea`, as it can remove the need for extra instructions. Locally at
least this fixes the performance loss relative to unconditionally using
`lea`.
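
As a rough illustrative sketch (not output from this commit; the
register choices and AT&T syntax are mine), lowering something like
`x = a + b + 16` with plain `add` instructions versus a single `lea`
might look like:

    # add-based lowering: `add` overwrites its destination, so a copy
    # may be needed first, plus one instruction per addition
    movq   %rdi, %rax        # rax = a
    addq   %rsi, %rax        # rax = a + b
    addq   $16, %rax         # rax = a + b + 16

    # lea-based lowering: one non-destructive, 3-operand instruction
    leaq   16(%rdi,%rsi,1), %rax   # rax = a + b + 16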

One note is that this adds an `OperandSize` argument to the
`MInst::LoadEffectiveAddress` variant so that a 32-bit `lea` can be
encoded in addition to the preexisting 64-bit encoding.
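
As a hedged aside (my own illustration, not taken from the commit): at
the assembly level the two forms use the same 0x8D opcode and differ
only in whether the REX.W bit is set, e.g.

    leaq   16(%rdi,%rsi,1), %rax   # 64-bit: REX.W set (a 0x48 prefix here)
    leal   16(%rdi,%rsi,1), %eax   # 32-bit: REX.W clear; writing %eax
                                   # zero-extends into the full %rax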
alexcrichton committed Mar 10, 2023
1 parent 0e95ee4 commit f1067eb
Showing 28 changed files with 631 additions and 331 deletions.
9 changes: 5 additions & 4 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -133,7 +133,8 @@

;; Loads the memory address of addr into dst.
(LoadEffectiveAddress (addr SyntheticAmode)
(dst WritableGpr))
(dst WritableGpr)
(size OperandSize))

;; Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
(MovsxRmR (ext_mode ExtMode)
@@ -3551,10 +3552,10 @@
(inst MInst (MInst.Neg size src dst)))
(ProducesFlags.ProducesFlagsReturnsResultWithConsumer inst dst)))

(decl x64_lea (SyntheticAmode) Gpr)
(rule (x64_lea addr)
(decl x64_lea (Type SyntheticAmode) Gpr)
(rule (x64_lea ty addr)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst (operand_size_of_type_32_64 ty)))))
dst))

;; Helper for creating `ud2` instructions.
18 changes: 7 additions & 11 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -871,20 +871,16 @@ pub(crate) fn emit(
)
}

Inst::LoadEffectiveAddress { addr, dst } => {
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = allocs.next(dst.to_reg().to_reg());
let amode = addr.finalize(state, sink).with_allocs(allocs);
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};

emit_std_reg_mem(
sink,
LegacyPrefixes::None,
0x8D,
1,
dst,
&amode,
RexFlags::set_w(),
0,
);
emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
}

Inst::MovsxRmR { ext_mode, src, dst } => {
7 changes: 4 additions & 3 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -437,6 +437,7 @@ impl Inst {
Inst::LoadEffectiveAddress {
addr: addr.into(),
dst: WritableGpr::from_writable_reg(dst).unwrap(),
size: OperandSize::Size64,
}
}

@@ -1392,8 +1393,8 @@ impl PrettyPrint for Inst {
format!("{} {}, {}", ljustify("movq".to_string()), src, dst)
}

Inst::LoadEffectiveAddress { addr, dst } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
let addr = addr.pretty_print(8, allocs);
format!("{} {}, {}", ljustify("lea".to_string()), addr, dst)
}
@@ -2117,7 +2118,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_def(dst.to_writable_reg());
src.get_operands(collector);
}
Inst::LoadEffectiveAddress { addr: src, dst } => {
Inst::LoadEffectiveAddress { addr: src, dst, .. } => {
collector.reg_def(dst.to_writable_reg());
src.get_operands(collector);
}
58 changes: 49 additions & 9 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -41,17 +41,57 @@

;; `i64` and smaller.

;; Add two registers.
(rule -5 (lower (has_type (fits_in_64 ty)
;; Base case for 8 and 16-bit types
(rule -6 (lower (has_type (fits_in_16 ty)
(iadd x y)))
(x64_add ty x y))

;; The above case handles when the rhs is an immediate or a sinkable load, but
;; additionally handle the case when the lhs meets these criteria.

;; Base case for 32 and 64-bit types which might end up using the `lea`
;; instruction to fold multiple operations into one. The actual determination
;; of whether to use `add` or `lea` is left up to the `add_or_lea` helper.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(add_or_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Small helper used as part of the lowering of `iadd` just above which chooses
;; either `lea` or `add` for the `Amode` given. The x64 `lea` instruction in
;; theory is a superior `add` alternative offering the means to have a 3-operand
;; instruction (aka better regalloc) along with the ability to fold multiple
;; pieces of functionality into one. In practice though it seems that it's not
;; so cut-and-dry. The `meshoptimizer` benchmark's `vtx` measurement, for
;; example, gets 10% slower if `lea` is unconditionally used here. The apparent
;; reason for this is that x64 cores have dedicated units for computing
;; addresses, but a finite number of them. It seems that forcing everything
;; through these units can cause a slowdown vs also using the ALUs which are
;; otherwise idle if there's a lot of add instructions.
;;
;; Given all that a rough heuristic is applied here. If the `Amode` is "simple"
;; and basically looks like one add instruction then the `add` instruction is
;; itself used. This covers cases like `a + $constant` or `a + b`. In these
;; cases the theoretical downside to using `add` is that the 3-operand mode
;; can't be used and this may require an extra `mov` relative to an `lea`
;; instruction.
;;
;; Otherwise if the `Amode` is "complicated", or can fold more than one
;; arithmetic instruction into it, then an `lea` is used. This means that
;; expressions of the form `a + b * c` or `a + b + $const` generate a single
;; `lea` instruction.
;;
;; Locally on the `meshoptimizer` benchmark this at least preserves the
;; performance relative to "always use `add`".
(decl add_or_lea (Type Amode) Reg)
(rule 1 (add_or_lea ty (Amode.ImmReg imm reg _flags))
(x64_add ty reg (RegMemImm.Imm imm)))
(rule 1 (add_or_lea ty (Amode.ImmRegRegShift 0 base index 0 _flags))
(x64_add ty base index))
(rule (add_or_lea ty mode)
(x64_lea ty mode))

;; Higher-priority cases than the previous two where a load can be sunk into
;; the add instruction itself. Note that both operands are tested for
;; sink-ability since addition is commutative
(rule -4 (lower (has_type (fits_in_64 ty)
(iadd (simm32_from_value x) y)))
(x64_add ty y x))
(iadd x (sinkable_load y))))
(x64_add ty x y))
(rule -3 (lower (has_type (fits_in_64 ty)
(iadd (sinkable_load x) y)))
(x64_add ty y x))
@@ -442,7 +482,7 @@
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
(base_mask_addr Gpr (x64_lea mask_table))
(base_mask_addr Gpr (x64_lea $I64 mask_table))
(mask_offset Gpr (x64_shl $I64 amt
(imm8_to_imm8_gpr 4))))
(Amode.ImmRegRegShift 0
@@ -547,7 +587,7 @@
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ushr_i8x16_mask_table))
(base_mask_addr Gpr (x64_lea mask_table))
(base_mask_addr Gpr (x64_lea $I64 mask_table))
(mask_offset Gpr (x64_shl $I64
amt
(imm8_to_imm8_gpr 4))))
35 changes: 17 additions & 18 deletions cranelift/filetests/filetests/isa/x64/immediates.clif
@@ -19,15 +19,16 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; addq %r9, const(0), %r9
; movq %r9, 0(%rsi)
; movq %rdi, %r10
; subq %r10, const(0), %r10
; movq %r10, 0(%rsi)
; movabsq $-18765284782900, %r9
; movq %rdi, %r11
; andq %r11, const(0), %r11
; addq %r11, %r9, %r11
; movq %r11, 0(%rsi)
; movq %rdi, %r11
; subq %r11, const(0), %r11
; movq %r11, 0(%rsi)
; movq %rdi, %rax
; andq %rax, const(0), %rax
; movq %rax, 0(%rsi)
; orq %rdi, const(0), %rdi
; movq %rdi, 0(%rsi)
; movq %rbp, %rsp
@@ -39,23 +40,21 @@
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; addq 0x32(%rip), %r9
; movq %r9, (%rsi) ; trap: heap_oob
; movq %rdi, %r10
; subq 0x25(%rip), %r10
; movq %r10, (%rsi) ; trap: heap_oob
; movabsq $18446725308424768716, %r9
; movq %rdi, %r11
; addq %r9, %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %r11
; andq 0x18(%rip), %r11
; subq 0x1f(%rip), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; orq 0xe(%rip), %rdi
; movq %rdi, %rax
; andq 0x12(%rip), %rax
; movq %rax, (%rsi) ; trap: heap_oob
; orq 8(%rip), %rdi
; movq %rdi, (%rsi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; int3
; int3
; fstp %st(5)