Skip to content

Commit

Permalink
Conditionally use lea based on regalloc
Browse files Browse the repository at this point in the history
  • Loading branch information
alexcrichton committed Mar 15, 2023
1 parent 28a77e6 commit 47faeeb
Show file tree
Hide file tree
Showing 19 changed files with 249 additions and 233 deletions.
80 changes: 74 additions & 6 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -872,13 +872,81 @@ pub(crate) fn emit(
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = allocs.next(dst.to_reg().to_reg());
let amode = addr.finalize(state, sink).with_allocs(allocs);
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};

emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
// If this `lea` can actually get encoded as an `add` then do that
// instead. Currently all candidate `iadd`s become an `lea`
// pseudo-instruction here but maximizing the use of `lea` is not
// necessarily optimal. The `lea` instruction goes through dedicated
// address units on cores which are finite and disjoint from the
// general ALU, so if everything uses `lea` then those units can get
// saturated while leaving the ALU idle.
//
// To help make use of more parts of a cpu, this attempts to use
// `add` when it's semantically equivalent to `lea`, or otherwise
// when the `dst` register is the same as the `base` or `index`
// register.
//
// FIXME: ideally regalloc is informed of this constraint. Register
// allocation of `lea` should "attempt" to put the `base` in the
// same register as `dst` but not at the expense of generating a
// `mov` instruction. Currently that's not possible but perhaps one
// day it may be worth it.
match amode {
// If `base == dst` then this is `add $imm, %dst`, so encode
// that instead.
Amode::ImmReg {
simm32,
base,
flags: _,
} if base == dst => {
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::imm(simm32),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
}
// If the offset is 0 and the shift is 0 (meaning multiplication
// by 1) then:
//
// * If `base == dst`, then this is `add %index, %base`
// * If `index == dst`, then this is `add %base, %index`
//
// Encode the appropriate instruction here in that case.
Amode::ImmRegRegShift {
simm32: 0,
base,
index,
shift: 0,
flags: _,
} if base == dst || index == dst => {
let (dst, operand) = if base == dst {
(base, index)
} else {
(index, base)
};
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::reg(operand.to_reg()),
Writable::from_reg(dst.to_reg()),
);
inst.emit(&[], sink, info, state);
}

// If `lea`'s 3-operand mode is leveraged by regalloc, or if
// it's fancy like imm-plus-shift-plus-base, then `lea` is
// actually emitted.
_ => {
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};
emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
}
};
}

Inst::MovsxRmR { ext_mode, src, dst } => {
Expand Down
44 changes: 7 additions & 37 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -47,44 +47,14 @@
(x64_add ty x y))

;; Base case for 32 and 64-bit types which might end up using the `lea`
;; instruction to fold multiple operations into one. The actual determination
;; of whether to use `add` or `lea` is left up to the `add_or_lea` helper.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(add_or_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Small helper used as part of the lowering of `iadd` just above which chooses
;; either `lea` or `add` for the `Amode` given. The x64 `lea` instruction in
;; theory is a superior `add` alternative offering the means to have a 3-operand
;; instruction (aka better regalloc) along with the ability to fold multiple
;; pieces of functionality into one. In practice though it seems that it's not
;; so cut-and-dry. The `meshoptimizer` benchmark's `vtx` measurement, for
;; example, gets 10% slower if `lea` is unconditionally used here. The apparent
;; reason for this is that x64 cores have dedicated units for computing
;; addresses, but a finite number of them. It seems that forcing everything
;; through these units can cause a slowdown vs also using the ALUs which are
;; otherwise idle if there's a lot of add instructions.
;;
;; Given all that a rough heuristic is applied here. If the `Amode` is "simple"
;; and basically looks like one add instruction then the `add` instruction is
;; itself used. This covers cases like `a + $constant` or `a + b`. In these
;; cases the theoretical downside to using `add` is that the 3-operand mode
;; can't be used and this may require an extra `mov` relative to an `lea`
;; instruction.
;; instruction to fold multiple operations into one.
;;
;; Otherwise if the `Amode` is "complicated", or can fold more than one
;; arithmetic instruction into it, then an `lea` is used. This means that
;; expressions of the form `a + b * c` or `a + b + $const` generate a single
;; `lea` instruction.
;;
;; Locally on the `meshoptimizer` benchmark this at least preserves the
;; performance relative to "always use `add`".
(decl add_or_lea (Type Amode) Reg)
(rule 1 (add_or_lea ty (Amode.ImmReg imm reg _flags))
(x64_add ty reg (RegMemImm.Imm imm)))
(rule 1 (add_or_lea ty (Amode.ImmRegRegShift 0 base index 0 _flags))
(x64_add ty base index))
(rule (add_or_lea ty mode)
(x64_lea ty mode))
;; Note that at this time this always generates a `lea` pseudo-instruction,
;; but the actual instruction emitted might be an `add` if it's equivalent.
;; For more details on this see the `emit.rs` logic to emit
;; `LoadEffectiveAddress`.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(x64_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Higher-priority cases than the previous two where a load can be sunk into
;; the add instruction itself. Note that both operands are tested for
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/amode-opt.clif
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,7 @@ block0(v0: i64, v1: i32, v2: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rsi, %r8
; addl %r8d, %edx, %r8d
; lea 0(%rsi,%rdx,1), %r8d
; shll $2, %r8d, %r8d
; movq -1(%rdi,%r8,1), %rax
; movq %rbp, %rsp
Expand All @@ -259,8 +258,7 @@ block0(v0: i64, v1: i32, v2: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rsi, %r8
; addl %edx, %r8d
; leal (%rsi, %rdx), %r8d
; shll $2, %r8d
; movq -1(%rdi, %r8), %rax ; trap: heap_oob
; movq %rbp, %rsp
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/basic.clif
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -22,8 +21,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/branches.clif
Original file line number Diff line number Diff line change
Expand Up @@ -784,8 +784,7 @@ block5(v5: i32):
; movl $4, %esi
; jmp label7
; block7:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand Down Expand Up @@ -825,8 +824,7 @@ block5(v5: i32):
; block6: ; offset 0x59
; movl $4, %esi
; block7: ; offset 0x5e
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
14 changes: 6 additions & 8 deletions cranelift/filetests/filetests/isa/x64/immediates.clif
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block0:
; movabsq $-18765284782900, %r9
; movq %rdi, %r11
; addq %r11, %r9, %r11
; lea 0(%rdi,%r9,1), %r11
; movq %r11, 0(%rsi)
; movq %rdi, %r11
; subq %r11, const(0), %r11
Expand All @@ -41,21 +40,20 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block1: ; offset 0x4
; movabsq $18446725308424768716, %r9
; movq %rdi, %r11
; addq %r9, %r11
; leaq (%rdi, %r9), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %r11
; subq 0x1f(%rip), %r11
; subq 0x20(%rip), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %rax
; andq 0x12(%rip), %rax
; andq 0x13(%rip), %rax
; movq %rax, (%rsi) ; trap: heap_oob
; orq 8(%rip), %rdi
; orq 9(%rip), %rdi
; movq %rdi, (%rsi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
; int3
; addb %cl, %ah
; int3
; fstp %st(5)
; outb %al, %dx
Expand Down
24 changes: 8 additions & 16 deletions cranelift/filetests/filetests/isa/x64/lea.clif
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, %esi, %eax
; lea 0(%rdi,%rsi,1), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -22,8 +21,7 @@ block0(v0: i32, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl %esi, %eax
; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -38,8 +36,7 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addq %rax, %rsi, %rax
; lea 0(%rdi,%rsi,1), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -49,8 +46,7 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addq %rsi, %rax
; leaq (%rdi, %rsi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -66,8 +62,7 @@ block0(v0: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addl %eax, $100, %eax
; lea 100(%rdi), %eax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -77,8 +72,7 @@ block0(v0: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addl $0x64, %eax
; leal 0x64(%rdi), %eax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -94,8 +88,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; addq %rax, $100, %rax
; lea 100(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -105,8 +98,7 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rax
; addq $0x64, %rax
; leaq 0x64(%rdi), %rax
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down
6 changes: 2 additions & 4 deletions cranelift/filetests/filetests/isa/x64/load-op.clif
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block0:
; movq 0(%rdi), %r8
; movq %r8, %r9
; addq %r9, %rdi, %r9
; lea 0(%r8,%rdi,1), %r9
; movq %r9, 0(%rsi)
; movq 0(%r8,%rdi,1), %rax
; movq %rbp, %rsp
Expand All @@ -169,8 +168,7 @@ block0(v0: i64, v1: i64):
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq (%rdi), %r8 ; trap: heap_oob
; movq %r8, %r9
; addq %rdi, %r9
; leaq (%r8, %rdi), %r9
; movq %r9, (%rsi) ; trap: heap_oob
; movq (%r8, %rdi), %rax ; trap: heap_oob
; movq %rbp, %rsp
Expand Down
32 changes: 16 additions & 16 deletions cranelift/filetests/filetests/isa/x64/pinned-reg.clif
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %r15, %rsi
; addq %rsi, $1, %rsi
; movq %rsi, %r15
; movq %r15, %rdi
; lea 1(%rdi), %rdi
; movq %rdi, %r15
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -26,9 +26,9 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %r15, %rsi
; addq $1, %rsi
; movq %rsi, %r15
; movq %r15, %rdi
; addq $1, %rdi
; movq %rdi, %r15
; movq %rbp, %rsp
; popq %rbp
; retq
Expand All @@ -45,12 +45,12 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; movq %rsi, 0(%rsp)
; movq %rdi, 0(%rsp)
; block0:
; movq %r15, %rsi
; addq %rsi, $1, %rsi
; movq %rsi, %r15
; movq 0(%rsp), %rsi
; movq %r15, %rdi
; lea 1(%rdi), %rdi
; movq %rdi, %r15
; movq 0(%rsp), %rdi
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
Expand All @@ -61,12 +61,12 @@ block0:
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; movq %rsi, (%rsp)
; movq %rdi, (%rsp)
; block1: ; offset 0xc
; movq %r15, %rsi
; addq $1, %rsi
; movq %rsi, %r15
; movq (%rsp), %rsi
; movq %r15, %rdi
; addq $1, %rdi
; movq %rdi, %r15
; movq (%rsp), %rdi
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
Expand Down
Loading

0 comments on commit 47faeeb

Please sign in to comment.