Skip to content

Commit

Permalink
cranelift: Add inline stack probe for x64
Browse files Browse the repository at this point in the history
  • Loading branch information
afonso360 committed Sep 1, 2022
1 parent c54d838 commit bcfb253
Show file tree
Hide file tree
Showing 16 changed files with 398 additions and 17 deletions.
12 changes: 12 additions & 0 deletions cranelift/codegen/meta/src/shared/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,18 @@ pub(crate) fn define() -> SettingGroup {
12,
);

settings.add_enum(
"probestack_strategy",
"Controls what kinds of stack probes are emitted.",
r#"
Supported strategies:
- `outline`: Always emits stack probes as calls to a probe stack function.
- `inline`: Always emits inline stack probes.
"#,
vec!["outline", "inline"],
);

// Jump table options.

settings.add_bool(
Expand Down
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/aarch64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
smallvec![]
}

/// Inline stack probing is not available on AArch64: this always panics via
/// `unimplemented!` and never returns an instruction sequence.
fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
unimplemented!("Inline stack probing is unimplemented on AArch64");
}

// Returns stack bytes used as well as instructions. Does not adjust
// nominal SP offset; abi generic code will do that.
fn gen_clobber_save(
Expand Down
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/s390x/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,10 @@ impl ABIMachineSpec for S390xMachineDeps {
smallvec![]
}

/// Inline stack probing is not available on S390x: this always panics via
/// `unimplemented!` and never returns an instruction sequence.
fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
unimplemented!("Inline stack probing is unimplemented on S390x");
}

// Returns stack bytes used as well as instructions. Does not adjust
// nominal SP offset; abi generic code will do that.
fn gen_clobber_save(
Expand Down
53 changes: 53 additions & 0 deletions cranelift/codegen/src/isa/x64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,42 @@ pub(crate) type X64Caller = Caller<X64ABIMachineSpec>;
/// Implementation of ABI primitives for x64.
pub struct X64ABIMachineSpec;

impl X64ABIMachineSpec {
    /// Emits `probe_count` straight-line stack probes.
    ///
    /// Probe `i` (counting from 1) is a store to `[rsp - i * guard_size]`, so one
    /// location is touched per `guard_size` bytes below the current stack pointer.
    /// The stack pointer itself is not modified by the returned instructions.
    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
        let mut insts = SmallVec::with_capacity(probe_count as usize);
        for i in 0..probe_count {
            // Widen to i64 *before* multiplying: doing the multiplication in u32 and
            // casting afterwards could overflow for large guard sizes.
            let offset = i64::from(guard_size) * i64::from(i + 1);

            // TODO: It would be nice if we could store the imm 0, but we don't have insts for
            // those, so store the stack pointer. Any register will do, since the stack is
            // undefined at this point.
            insts.push(Self::gen_store_stack(
                StackAMode::SPOffset(-offset, I8),
                regs::rsp(),
                I32,
            ));
        }
        insts
    }

    /// Emits a single `Inst::StackProbeLoop` pseudo-instruction that probes the
    /// frame of `frame_size` bytes in `guard_size` steps, using `r11` as scratch.
    fn gen_probestack_loop(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
        // We have to use a caller saved register since clobbering only happens
        // after stack probing.
        //
        // R11 is caller saved on both Fastcall and SystemV, and not used for argument
        // passing, so it's pretty much free. It is also not used by the stacklimit mechanism.
        let tmp = regs::r11();
        debug_assert!({
            let real_reg = tmp.to_real_reg().unwrap();
            !is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false)
        });

        smallvec![Inst::StackProbeLoop {
            tmp: Writable::from_reg(tmp),
            frame_size,
            guard_size,
        }]
    }
}

impl IsaFlags for x64_settings::Flags {}

impl ABIMachineSpec for X64ABIMachineSpec {
Expand Down Expand Up @@ -398,6 +434,23 @@ impl ABIMachineSpec for X64ABIMachineSpec {
insts
}

/// Generates an inline stack probe sequence for a frame of `frame_size` bytes,
/// touching the stack once per `guard_size` bytes.
///
/// Small frames get straight-line (unrolled) probes; larger ones fall back to
/// a probe loop.
fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Self::I> {
    // Upper bound on the number of consecutive probes emitted before switching
    // to the loop form.
    //
    // This number was picked because the loop version is 38 bytes long. We can fit
    // 5 inline probes in that space, so unroll only while it's beneficial in terms
    // of code size.
    const PROBE_MAX_UNROLL: u32 = 5;

    // How many guard-sized steps are needed to cover the whole frame.
    let probes_needed = align_to(frame_size, guard_size) / guard_size;

    if probes_needed > PROBE_MAX_UNROLL {
        return Self::gen_probestack_loop(frame_size, guard_size);
    }

    Self::gen_probestack_unroll(guard_size, probes_needed)
}

fn gen_clobber_save(
_call_conv: isa::CallConv,
setup_frame: bool,
Expand Down
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@
;; popq reg
(Pop64 (dst WritableGpr))

;; Emits an inline stack probe loop.
;; `tmp` is a scratch register used by the loop; `frame_size` and
;; `guard_size` are the frame size and the probing step.
(StackProbeLoop (tmp WritableReg)
(frame_size u32)
(guard_size u32))

;; =========================================
;; Floating-point operations.

Expand Down
107 changes: 107 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,113 @@ pub(crate) fn emit(
sink.put1(0x58 + (enc_dst & 7));
}

Inst::StackProbeLoop {
tmp,
frame_size,
guard_size,
} => {
// Requires stack probing to be enabled and `guard_size` to be a power of two.
assert!(info.flags.enable_probestack());
assert!(guard_size.is_power_of_two());

let tmp = allocs.next_writable(*tmp);

// Number of probes that we need to perform
let probe_count = align_to(*frame_size, *guard_size) / guard_size;

// The inline stack probe loop has 3 phases
//
// We generate the "guard area" register which is essentially the frame_size aligned to
// guard_size. We copy the stack pointer and subtract the guard area from it. This
// gets us a register that we can use to compare when looping.
//
// After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
// distance at a time and then touch the stack by writing anything to it. We use the previously
// created "guard area" register to know when to stop looping.
//
// When we have touched all the pages that we need, we have to restore the stack pointer
// to where it was before.
//
// The loop back-edge is emitted with `one_way_jmp` targeting the `loop_start` label
// obtained from the sink below, so no jump offset is computed by hand here.
// NOTE(review): this comment previously said labels were unavailable at this point and
// that the offset had to be updated manually; that does not match the code below.
//
// Generate the following code:
// mov tmp_reg, rsp
// sub tmp_reg, guard_size * probe_count
// .loop_start:
// sub rsp, guard_size
// mov [rsp], rsp
// cmp rsp, tmp_reg
// jne .loop_start
// add rsp, guard_size * probe_count

// Create the guard bound register
// mov tmp_reg, rsp
let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
inst.emit(&[], sink, info, state);

// sub tmp_reg, GUARD_SIZE * probe_count
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(guard_size * probe_count),
tmp,
);
inst.emit(&[], sink, info, state);

// Emit the main loop!
let loop_start = sink.get_label();
sink.bind_label(loop_start);

// sub rsp, GUARD_SIZE
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(*guard_size),
Writable::from_reg(regs::rsp()),
);
inst.emit(&[], sink, info, state);

// TODO: `mov [rsp], 0` would be better, but we don't have that instruction
// Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
// instruction size.
// mov [rsp], rsp
let inst = Inst::mov_r_m(
OperandSize::Size32, // Use Size32 since it saves us one byte
regs::rsp(),
SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())),
);
inst.emit(&[], sink, info, state);

// Compare and jump if we are not done yet
// cmp rsp, tmp_reg
let inst = Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::reg(regs::rsp()),
tmp.to_reg(),
);
inst.emit(&[], sink, info, state);

// jne .loop_start
// TODO: Encoding the JmpIf as a short jump saves us 4 bytes here.
one_way_jmp(sink, CC::NZ, loop_start);

// The regular prologue code is going to emit a `sub` after this, so we need to
// reset the stack pointer
//
// TODO: It would be better if we could avoid the `add` + `sub` that is generated here
// and in the stack adj portion of the prologue
//
// add rsp, GUARD_SIZE * probe_count
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(guard_size * probe_count),
Writable::from_reg(regs::rsp()),
);
inst.emit(&[], sink, info, state);
}

Inst::CallKnown {
dest,
info: call_info,
Expand Down
19 changes: 19 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ impl Inst {
| Inst::Nop { .. }
| Inst::Pop64 { .. }
| Inst::Push64 { .. }
| Inst::StackProbeLoop { .. }
| Inst::Ret { .. }
| Inst::Setcc { .. }
| Inst::ShiftR { .. }
Expand Down Expand Up @@ -1427,6 +1428,21 @@ impl PrettyPrint for Inst {
format!("{} {}", ljustify("pushq".to_string()), src)
}

Inst::StackProbeLoop {
    tmp,
    frame_size,
    guard_size,
} => {
    // Rendered as: `stack_probe_loop <tmp>, frame_size=<n>, guard_size=<n>`.
    // The register is resolved first, then the mnemonic column is built.
    let scratch_name = pretty_print_reg(tmp.to_reg(), 8, allocs);
    let mnemonic = ljustify("stack_probe_loop".to_string());
    format!(
        "{} {}, frame_size={}, guard_size={}",
        mnemonic, scratch_name, frame_size, guard_size
    )
}

Inst::Pop64 { dst } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
format!("{} {}", ljustify("popq".to_string()), dst)
Expand Down Expand Up @@ -1946,6 +1962,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
Inst::Pop64 { dst } => {
collector.reg_def(dst.to_writable_reg());
}
Inst::StackProbeLoop { tmp, .. } => {
// `tmp` is the only register operand reported for this instruction, and it is
// reported as an early def.
// NOTE(review): presumably "early" keeps `tmp` distinct from any register that is
// live across the emitted sequence — confirm against the register allocator docs.
collector.reg_early_def(*tmp);
}

Inst::CallKnown { ref info, .. } => {
for &u in &info.uses {
Expand Down
22 changes: 18 additions & 4 deletions cranelift/codegen/src/machinst/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ use crate::ir::types::*;
use crate::ir::{ArgumentExtension, ArgumentPurpose, DynamicStackSlot, Signature, StackSlot};
use crate::isa::TargetIsa;
use crate::settings;
use crate::settings::ProbestackStrategy;
use crate::CodegenResult;
use crate::{ir, isa};
use crate::{machinst::*, trace};
Expand Down Expand Up @@ -430,6 +431,9 @@ pub trait ABIMachineSpec {
/// Generate a probestack call.
fn gen_probestack(_frame_size: u32) -> SmallInstVec<Self::I>;

/// Generate an inline stack probe.
///
/// `_frame_size` is the size of the frame to probe and `_guard_size` is the
/// distance between consecutive probes.
fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I>;

/// Get all clobbered registers that are callee-saved according to the ABI; the result
/// contains the registers in a sorted order.
fn get_clobbered_callee_saves(
Expand Down Expand Up @@ -1660,10 +1664,20 @@ impl<M: ABIMachineSpec> Callee<M> {
insts.extend(stack_limit_load.clone());
self.insert_stack_check(*reg, total_stacksize, &mut insts);
}
if let Some(min_frame) = &self.probestack_min_frame {
if total_stacksize >= *min_frame {
insts.extend(M::gen_probestack(total_stacksize));
}

let needs_probestack = self
.probestack_min_frame
.map_or(false, |min_frame| total_stacksize >= min_frame);

if needs_probestack {
insts.extend(
if self.flags.probestack_strategy() == ProbestackStrategy::Inline {
let guard_size = 1 << self.flags.probestack_size_log2();
M::gen_inline_probestack(total_stacksize, guard_size)
} else {
M::gen_probestack(total_stacksize)
},
);
}
}

Expand Down
1 change: 1 addition & 0 deletions cranelift/codegen/src/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ opt_level = "none"
tls_model = "none"
libcall_call_conv = "isa_default"
probestack_size_log2 = 12
probestack_strategy = "outline"
regalloc_checker = false
regalloc_verbose_logs = false
enable_alias_analysis = true
Expand Down
67 changes: 67 additions & 0 deletions cranelift/filetests/filetests/isa/x64/inline-probestack-large.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
test compile precise-output
set enable_probestack=true
; Test with the larger size of 64k
set probestack_size_log2=16
set probestack_strategy=inline
target x86_64



; If the stack size is just one page, we can avoid the stack probe entirely
function %single_page() -> i64 system_v {
ss0 = explicit_slot 8192

block0:
v1 = stack_addr.i64 ss0
return v1
}

; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $8192, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $8192, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %unrolled() -> i64 system_v {
ss0 = explicit_slot 196608

block0:
v1 = stack_addr.i64 ss0
return v1
}

; pushq %rbp
; movq %rsp, %rbp
; movl %esp, -65536(%rsp)
; movl %esp, -131072(%rsp)
; movl %esp, -196608(%rsp)
; subq %rsp, $196608, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $196608, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret

function %large() -> i64 system_v {
ss0 = explicit_slot 2097152

block0:
v1 = stack_addr.i64 ss0
return v1
}

; pushq %rbp
; movq %rsp, %rbp
; stack_probe_loop %r11, frame_size=2097152, guard_size=65536
; subq %rsp, $2097152, %rsp
; block0:
; lea rsp(0 + virtual offset), %rax
; addq %rsp, $2097152, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
Loading

0 comments on commit bcfb253

Please sign in to comment.