x64: Add lea-based lowering for iadd
This commit adds a rule for the lowering of `iadd` to use `lea` for 32
and 64-bit addition. The theoretical benefit of `lea` over the `add`
instruction is that the `lea` variant can emulate a 3-operand
instruction which doesn't destructively modify one of its operands.
Additionally the `lea` operation can fold in other components such as
constant additions and shifts.

In practice, however, if `lea` is unconditionally used instead of `add`
it ends up losing 10% performance on a local `meshoptimizer` benchmark.
My best guess as to what's going on here is that my CPU's dedicated
units for address computation are all overloaded while the ALUs are
basically idle in a memory-intensive loop. Previously, when the ALU was
used for `add` and the address units for stores/loads, things in theory
pipelined better (most of this is me shooting in the dark). To prevent
the performance loss here I've updated the lowering of `iadd` to use
either `lea` or `add` depending on how "complicated" the `Amode` is.
Simple ones like `a + b` or `a + $imm` continue to use `add` (accepting
the hypothetical extra `mov` needed to move the result into place).
More complicated ones like `a + b + $imm` or `a + (b << c) + $imm` use
`lea`, as it can remove the need for extra instructions. Locally at
least this fixes the performance loss relative to unconditionally using
`lea`.
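
As a rough illustrative sketch (not output from this commit; the
register choices and AT&T syntax are mine), lowering something like
`x = a + b + 16` with plain `add` instructions versus a single `lea`
might look like:

    # add-based lowering: `add` overwrites its destination, so a copy
    # may be needed first, plus one instruction per addition
    movq   %rdi, %rax        # rax = a
    addq   %rsi, %rax        # rax = a + b
    addq   $16, %rax         # rax = a + b + 16

    # lea-based lowering: one non-destructive, 3-operand instruction
    leaq   16(%rdi,%rsi,1), %rax   # rax = a + b + 16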

One note is that this adds an `OperandSize` argument to the
`MInst::LoadEffectiveAddress` variant so that a 32-bit `lea` can be
encoded in addition to the preexisting 64-bit encoding.
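
As a hedged aside (my own illustration, not taken from the commit): at
the assembly level the two forms use the same 0x8D opcode and differ
only in whether the REX.W bit is set, e.g.

    leaq   16(%rdi,%rsi,1), %rax   # 64-bit: REX.W set (a 0x48 prefix here)
    leal   16(%rdi,%rsi,1), %eax   # 32-bit: REX.W clear; writing %eax
                                   # zero-extends into the full %rax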
alexcrichton committed Mar 10, 2023
1 parent 0e95ee4 commit f1067eb
Showing 28 changed files with 631 additions and 331 deletions.
9 changes: 5 additions & 4 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -133,7 +133,8 @@

;; Loads the memory address of addr into dst.
(LoadEffectiveAddress (addr SyntheticAmode)
(dst WritableGpr))
(dst WritableGpr)
(size OperandSize))

;; Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
(MovsxRmR (ext_mode ExtMode)
@@ -3551,10 +3552,10 @@
(inst MInst (MInst.Neg size src dst)))
(ProducesFlags.ProducesFlagsReturnsResultWithConsumer inst dst)))

(decl x64_lea (SyntheticAmode) Gpr)
(rule (x64_lea addr)
(decl x64_lea (Type SyntheticAmode) Gpr)
(rule (x64_lea ty addr)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst (operand_size_of_type_32_64 ty)))))
dst))

;; Helper for creating `ud2` instructions.
18 changes: 7 additions & 11 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -871,20 +871,16 @@ pub(crate) fn emit(
)
}

Inst::LoadEffectiveAddress { addr, dst } => {
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = allocs.next(dst.to_reg().to_reg());
let amode = addr.finalize(state, sink).with_allocs(allocs);
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};

emit_std_reg_mem(
sink,
LegacyPrefixes::None,
0x8D,
1,
dst,
&amode,
RexFlags::set_w(),
0,
);
emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
}

Inst::MovsxRmR { ext_mode, src, dst } => {
7 changes: 4 additions & 3 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -437,6 +437,7 @@ impl Inst {
Inst::LoadEffectiveAddress {
addr: addr.into(),
dst: WritableGpr::from_writable_reg(dst).unwrap(),
size: OperandSize::Size64,
}
}

@@ -1392,8 +1393,8 @@ impl PrettyPrint for Inst {
format!("{} {}, {}", ljustify("movq".to_string()), src, dst)
}

Inst::LoadEffectiveAddress { addr, dst } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
let addr = addr.pretty_print(8, allocs);
format!("{} {}, {}", ljustify("lea".to_string()), addr, dst)
}
@@ -2117,7 +2118,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_def(dst.to_writable_reg());
src.get_operands(collector);
}
Inst::LoadEffectiveAddress { addr: src, dst } => {
Inst::LoadEffectiveAddress { addr: src, dst, .. } => {
collector.reg_def(dst.to_writable_reg());
src.get_operands(collector);
}
58 changes: 49 additions & 9 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -41,17 +41,57 @@

;; `i64` and smaller.

;; Add two registers.
(rule -5 (lower (has_type (fits_in_64 ty)
;; Base case for 8 and 16-bit types
(rule -6 (lower (has_type (fits_in_16 ty)
(iadd x y)))
(x64_add ty x y))

;; The above case handles when the rhs is an immediate or a sinkable load, but
;; additionally handle the case when the lhs meets these criteria.

;; Base case for 32 and 64-bit types which might end up using the `lea`
;; instruction to fold multiple operations into one. The actual determination
;; of whether to use `add` or `lea` is left up to the `add_or_lea` helper.
(rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
(add_or_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Small helper used as part of the lowering of `iadd` just above which chooses
;; either `lea` or `add` for the `Amode` given. The x64 `lea` instruction in
;; theory is a superior `add` alternative offering the means to have a 3-operand
;; instruction (aka better regalloc) along with the ability to fold multiple
;; pieces of functionality into one. In practice though it seems that it's not
;; so cut-and-dry. The `meshoptimizer` benchmark's `vtx` measurement, for
;; example, gets 10% slower if `lea` is unconditionally used here. The apparent
;; reason for this is that x64 cores have dedicated units for computing
;; addresses, but a finite number of them. It seems that forcing everything
;; through these units can cause a slowdown vs also using the ALUs which are
;; otherwise idle if there's a lot of add instructions.
;;
;; Given all that a rough heuristic is applied here. If the `Amode` is "simple"
;; and basically looks like one add instruction then the `add` instruction is
;; itself used. This covers cases like `a + $constant` or `a + b`. In these
;; cases the theoretical downside to using `add` is that the 3-operand mode
;; can't be used and this may require an extra `mov` relative to an `lea`
;; instruction.
;;
;; Otherwise if the `Amode` is "complicated", or can fold more than one
;; arithmetic instruction into it, then an `lea` is used. This means that
;; expressions of the form `a + b * c` or `a + b + $const` generate a single
;; `lea` instruction.
;;
;; Locally on the `meshoptimizer` benchmark this at least preserves the
;; performance relative to "always use `add`".
(decl add_or_lea (Type Amode) Reg)
(rule 1 (add_or_lea ty (Amode.ImmReg imm reg _flags))
(x64_add ty reg (RegMemImm.Imm imm)))
(rule 1 (add_or_lea ty (Amode.ImmRegRegShift 0 base index 0 _flags))
(x64_add ty base index))
(rule (add_or_lea ty mode)
(x64_lea ty mode))

;; Higher-priority cases than the previous two where a load can be sunk into
;; the add instruction itself. Note that both operands are tested for
;; sink-ability since addition is commutative
(rule -4 (lower (has_type (fits_in_64 ty)
(iadd (simm32_from_value x) y)))
(x64_add ty y x))
(iadd x (sinkable_load y))))
(x64_add ty x y))
(rule -3 (lower (has_type (fits_in_64 ty)
(iadd (sinkable_load x) y)))
(x64_add ty y x))
@@ -442,7 +482,7 @@
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
(base_mask_addr Gpr (x64_lea mask_table))
(base_mask_addr Gpr (x64_lea $I64 mask_table))
(mask_offset Gpr (x64_shl $I64 amt
(imm8_to_imm8_gpr 4))))
(Amode.ImmRegRegShift 0
@@ -547,7 +587,7 @@
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ushr_i8x16_mask_table))
(base_mask_addr Gpr (x64_lea mask_table))
(base_mask_addr Gpr (x64_lea $I64 mask_table))
(mask_offset Gpr (x64_shl $I64
amt
(imm8_to_imm8_gpr 4))))
35 changes: 17 additions & 18 deletions cranelift/filetests/filetests/isa/x64/immediates.clif
@@ -19,15 +19,16 @@ block0(v0: i64, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; addq %r9, const(0), %r9
; movq %r9, 0(%rsi)
; movq %rdi, %r10
; subq %r10, const(0), %r10
; movq %r10, 0(%rsi)
; movabsq $-18765284782900, %r9
; movq %rdi, %r11
; andq %r11, const(0), %r11
; addq %r11, %r9, %r11
; movq %r11, 0(%rsi)
; movq %rdi, %r11
; subq %r11, const(0), %r11
; movq %r11, 0(%rsi)
; movq %rdi, %rax
; andq %rax, const(0), %rax
; movq %rax, 0(%rsi)
; orq %rdi, const(0), %rdi
; movq %rdi, 0(%rsi)
; movq %rbp, %rsp
@@ -39,23 +40,21 @@
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; addq 0x32(%rip), %r9
; movq %r9, (%rsi) ; trap: heap_oob
; movq %rdi, %r10
; subq 0x25(%rip), %r10
; movq %r10, (%rsi) ; trap: heap_oob
; movabsq $18446725308424768716, %r9
; movq %rdi, %r11
; addq %r9, %r11
; movq %r11, (%rsi) ; trap: heap_oob
; movq %rdi, %r11
; andq 0x18(%rip), %r11
; subq 0x1f(%rip), %r11
; movq %r11, (%rsi) ; trap: heap_oob
; orq 0xe(%rip), %rdi
; movq %rdi, %rax
; andq 0x12(%rip), %rax
; movq %rax, (%rsi) ; trap: heap_oob
; orq 8(%rip), %rdi
; movq %rdi, (%rsi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; int3
; int3
; fstp %st(5)