Skip to content

Commit

Permalink
Conditionally use lea based on regalloc
Browse files Browse the repository at this point in the history
  • Loading branch information
alexcrichton committed Mar 15, 2023
1 parent 28a77e6 commit 47faeeb
Show file tree
Hide file tree
Showing 19 changed files with 249 additions and 233 deletions.
80 changes: 74 additions & 6 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -872,13 +872,81 @@ pub(crate) fn emit(
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = allocs.next(dst.to_reg().to_reg());
let amode = addr.finalize(state, sink).with_allocs(allocs);
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};

emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
// If this `lea` can actually get encoded as an `add` then do that
// instead. Currently all candidate `iadd`s become an `lea`
// pseudo-instruction here but maximizing the use of `lea` is not
// necessarily optimal. The `lea` instruction goes through dedicated
// address units on cores which are finite and disjoint from the
// general ALU, so if everything uses `lea` then those units can get
// saturated while leaving the ALU idle.
//
// To help make use of more parts of a cpu, this attempts to use
// `add` when it's semantically equivalent to `lea`, or otherwise
// when the `dst` register is the same as the `base` or `index`
// register.
//
// FIXME: ideally regalloc is informed of this constraint. Register
// allocation of `lea` should "attempt" to put the `base` in the
// same register as `dst` but not at the expense of generating a
// `mov` instruction. Currently that's not possible but perhaps one
// day it may be worth it.
match amode {
// If `base == dst` then this is `add $imm, %dst`, so encode
// that instead.
Amode::ImmReg {
simm32,
base,
flags: _,
} if base == dst => {
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::imm(simm32),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
}
// If the offset is 0 and the shift is 0 (meaning multiplication
// by 1) then:
//
// * If `base == dst`, then this is `add %index, %base`
// * If `index == dst`, then this is `add %base, %index`
//
// Encode the appropriate instruction here in that case.
Amode::ImmRegRegShift {
simm32: 0,
base,
index,
shift: 0,
flags: _,
} if base == dst || index == dst => {
let (dst, operand) = if base == dst {
(base, index)
} else {
(index, base)
};
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::reg(operand.to_reg()),
Writable::from_reg(dst.to_reg()),
);
inst.emit(&[], sink, info, state);
}

// If `lea`'s 3-operand mode is leveraged by regalloc, or if
// it's fancy like imm-plus-shift-plus-base, then `lea` is
// actually emitted.
_ => {
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};
emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
}
};
}

Inst::MovsxRmR { ext_mode, src, dst } => {
Expand Down
44 changes: 7 additions & 37 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -47,44 +47,14 @@
(x64_add ty x y))

;; Base case for 32 and 64-bit types which might end up using the `lea`
;; instruction to fold multiple operations into one. The actual determination
;; of whether to use `add` or `lea` is left up to the `add_or_lea` helper.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(add_or_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Small helper used as part of the lowering of `iadd` just above which chooses
;; either `lea` or `add` for the `Amode` given. The x64 `lea` instruction in
;; theory is a superior `add` alternative offering the means to have a 3-operand
;; instruction (aka better regalloc) along with the ability to fold multiple
;; pieces of functionality into one. In practice though it seems that it's not
;; so cut-and-dry. The `meshoptimizer` benchmark's `vtx` measurement, for
;; example, gets 10% slower if `lea` is unconditionally used here. The apparent
;; reason for this is that x64 cores have dedicated units for computing
;; addresses, but a finite number of them. It seems that forcing everything
;; through these units can cause a slowdown vs also using the ALUs which are
;; otherwise idle if there's a lot of add instructions.
;;
;; Given all that a rough heuristic is applied here. If the `Amode` is "simple"
;; and basically looks like one add instruction then the `add` instruction is
;; itself used. This covers cases like `a + $constant` or `a + b`. In these
;; cases the theoretical downside to using `add` is that the 3-operand mode
;; can't be used and this may require an extra `mov` relative to an `lea`
;; instruction.
;; instruction to fold multiple operations into one.
;;
;; Otherwise if the `Amode` is "complicated", or can fold more than one
;; arithmetic instruction into it, then an `lea` is used. This means that
;; expressions of the form `a + b * c` or `a + b + $const` generate a single
;; `lea` instruction.
;;
;; Locally on the `meshoptimizer` benchmark this at least preserves the
;; performance relative to "always use `add`".
(decl add_or_lea (Type Amode) Reg)
(rule 1 (add_or_lea ty (Amode.ImmReg imm reg _flags))
(x64_add ty reg (RegMemImm.Imm imm)))
(rule 1 (add_or_lea ty (Amode.ImmRegRegShift 0 base index 0 _flags))
(x64_add ty base index))
(rule (add_or_lea ty mode)
(x64_lea ty mode))
;; Note that at this time this always generates a `lea` pseudo-instruction,
;; but the actual instruction emitted might be an `add` if it's equivalent.
;; For more details on this see the `emit.rs` logic to emit
;; `LoadEffectiveAddress`.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(x64_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Higher-priority cases than the previous two where a load can be sunk into
;; the add instruction itself. Note that both operands are tested for
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/amode-opt.clif
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,7 @@ block0(v0: i64, v1: i32, v2: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rsi, %r8
; addl %r8d, %edx, %r8d
; lea 0(%rsi,%rdx,1), %r8d
; shll $2, %r8d, %r8d
; movq -1(%rdi,%r8,1), %rax
; movq %rbp, %rsp
Expand All @@ -259,8 +258,7 @@ block0(v0: i64, v1: i32, v2: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rsi, %r8
; addl %edx, %r8d
; leal (%rsi, %rdx), %r8d
; shll $2, %r8d
; movq -1(%rdi, %r8), %rax ; trap: heap_oob
; movq %rbp, %rsp
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/basic.clif
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -22,8 +21,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/branches.clif
Original file line number Diff line number Diff line change
Expand Up @@ -784,8 +784,7 @@ block5(v5: i32):
; movl $4, %esi
; jmp label7
; block7:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand Down Expand Up @@ -825,8 +824,7 @@ block5(v5: i32):
; block6: ; offset 0x59
; movl $4, %esi
; block7: ; offset 0x5e
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
14 changes: 6 additions & 8 deletions cranelift/filetests/filetests/isa/x64/immediates.clif
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block0:
; movabsq $-18765284782900, %r9
; movq %rdi, %r11
; addq %r11, %r9, %r11
; lea 0(%rdi,%r9,1), %r11
; movq %r11, 0(%rsi)
; movq %rdi, %r11
; subq %r11, const(0), %r11
Expand All @@ -41,21 +40,20 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $18446725308424768716, %r9
; movq %rdi, %r11
; addq %r9, %r11
; leaq (%rdi, %r9), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %r11
; subq 0x1f(%rip), %r11
; subq 0x20(%rip), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %rax
; andq 0x12(%rip), %rax
; andq 0x13(%rip), %rax
; movq %rax, (%rsi) ; trap: heap_oob
; orq 8(%rip), %rdi
; orq 9(%rip), %rdi
; movq %rdi, (%rsi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
; int3
; addb %cl, %ah
; int3
; fstp %st(5)
; outb %al, %dx
Expand Down
24 changes: 8 additions & 16 deletions cranelift/filetests/filetests/isa/x64/lea.clif
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -22,8 +21,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -38,8 +36,7 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addq %rax, %rsi, %rax
; lea 0(%rdi,%rsi,1), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -49,8 +46,7 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addq %rsi, %rax
; leaq (%rdi, %rsi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -66,8 +62,7 @@ block0(v0: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, $100, %eax
; lea 100(%rdi), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -77,8 +72,7 @@ block0(v0: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl $0x64, %eax
; leal 0x64(%rdi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -94,8 +88,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addq %rax, $100, %rax
; lea 100(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -105,8 +98,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addq $0x64, %rax
; leaq 0x64(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/load-op.clif
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block0:
; movq 0(%rdi), %r8
; movq %r8, %r9
; addq %r9, %rdi, %r9
; lea 0(%r8,%rdi,1), %r9
; movq %r9, 0(%rsi)
; movq 0(%r8,%rdi,1), %rax
; movq %rbp, %rsp
Expand All @@ -169,8 +168,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq (%rdi), %r8 ; trap: heap_oob
; movq %r8, %r9
; addq %rdi, %r9
; leaq (%r8, %rdi), %r9
; movq %r9, (%rsi) ; trap: heap_oob
; movq (%r8, %rdi), %rax ; trap: heap_oob
; movq %rbp, %rsp
Expand Down
32 changes: 16 additions & 16 deletions cranelift/filetests/filetests/isa/x64/pinned-reg.clif
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %r15, %rsi
; addq %rsi, $1, %rsi
; movq %rsi, %r15
; movq %r15, %rdi
; lea 1(%rdi), %rdi
; movq %rdi, %r15
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -26,9 +26,9 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %r15, %rsi
; addq $1, %rsi
; movq %rsi, %r15
; movq %r15, %rdi
; addq $1, %rdi
; movq %rdi, %r15
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -45,12 +45,12 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; movq %rsi, 0(%rsp)
; movq %rdi, 0(%rsp)
; block0:
; movq %r15, %rsi
; addq %rsi, $1, %rsi
; movq %rsi, %r15
; movq 0(%rsp), %rsi
; movq %r15, %rdi
; lea 1(%rdi), %rdi
; movq %rdi, %r15
; movq 0(%rsp), %rdi
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
Expand All @@ -61,12 +61,12 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; movq %rsi, (%rsp)
; movq %rdi, (%rsp)
; block1: ; offset 0xc
; movq %r15, %rsi
; addq $1, %rsi
; movq %rsi, %r15
; movq (%rsp), %rsi
; movq %r15, %rdi
; addq $1, %rdi
; movq %rdi, %r15
; movq (%rsp), %rdi
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
Expand Down
Loading

0 comments on commit 47faeeb

Please sign in to comment.