Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cranelift: Port ishl SIMD lowerings to ISLE #3686

Merged
merged 1 commit into from
Jan 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle d95510fad2e2473c
src/prelude.isle 7b911d3b894ae17
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
11 changes: 6 additions & 5 deletions cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 70 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
(MovsxRmR (ext_mode ExtMode)
(src RegMem)
(dst WritableReg))
(Mov64MR (src SyntheticAmode)
(dst WritableReg))
(Cmove (size OperandSize)
(cc CC)
(consequent RegMem)
Expand All @@ -70,6 +72,8 @@
(Not (size OperandSize)
(src Reg)
(dst WritableReg))
(LoadEffectiveAddress (addr SyntheticAmode)
(dst WritableReg))
))

(type OperandSize extern
Expand Down Expand Up @@ -318,6 +322,17 @@

(type SyntheticAmode extern (enum))

(decl synthetic_amode_to_reg_mem (SyntheticAmode) RegMem)
(extern constructor synthetic_amode_to_reg_mem synthetic_amode_to_reg_mem)

(type Amode extern (enum))

(decl amode_imm_reg_reg_shift (u32 Reg Reg u8) Amode)
(extern constructor amode_imm_reg_reg_shift amode_imm_reg_reg_shift)

(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)

(type ShiftKind extern
(enum ShiftLeft
ShiftRightLogical
Expand Down Expand Up @@ -438,6 +453,11 @@

;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(type ExtKind extern
(enum None
SignExtend
ZeroExtend))

(type ExtendKind (enum Sign Zero))

(type ExtMode extern (enum BL BQ WL WQ LQ))
Expand Down Expand Up @@ -549,6 +569,40 @@
(RegMem.Reg r)
(OperandSize.Size32))))

;;;; Helpers for Emitting Loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load a value into a register.
(decl x64_load (Type SyntheticAmode ExtKind) Reg)

(rule (x64_load (fits_in_32 ty) addr (ExtKind.SignExtend))
(movsx ty
(ext_mode (ty_bytes ty) 8)
(synthetic_amode_to_reg_mem addr)))

(rule (x64_load $I64 addr _ext_kind)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.Mov64MR addr dst))))
(writable_reg_to_reg dst)))

(rule (x64_load $F32 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movss)
(synthetic_amode_to_reg_mem addr)))

(rule (x64_load $F64 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movsd)
(synthetic_amode_to_reg_mem addr)))

(rule (x64_load $F32X4 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movups)
(synthetic_amode_to_reg_mem addr)))

(rule (x64_load $F64X2 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movupd)
(synthetic_amode_to_reg_mem addr)))

(rule (x64_load (multi_lane _bits _lanes) addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movdqu)
(synthetic_amode_to_reg_mem addr)))

;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
Expand Down Expand Up @@ -1236,6 +1290,16 @@
dst))))
(writable_reg_to_reg dst)))

;; Helper for creating `psllw` instructions.
(decl psllw (Reg RegMemImm) Reg)
(rule (psllw src1 src2)
(xmm_rmi_reg (SseOpcode.Psllw) src1 src2))

;; Helper for creating `pslld` instructions.
(decl pslld (Reg RegMemImm) Reg)
(rule (pslld src1 src2)
(xmm_rmi_reg (SseOpcode.Pslld) src1 src2))

;; Helper for creating `psllq` instructions.
(decl psllq (Reg RegMemImm) Reg)
(rule (psllq src1 src2)
Expand Down Expand Up @@ -1353,3 +1417,9 @@
(size OperandSize (operand_size_of_type_32_64 ty))
(_ Unit (emit (MInst.Not size src dst))))
(writable_reg_to_reg dst)))

(decl lea (SyntheticAmode) Reg)
(rule (lea addr)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
(writable_reg_to_reg dst)))
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3053,6 +3053,12 @@ impl MachInst for Inst {
}

fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
log::trace!(
"Inst::gen_move {:?} -> {:?} (type: {:?})",
src_reg,
dst_reg.to_reg(),
ty
);
let rc_dst = dst_reg.to_reg().get_class();
let rc_src = src_reg.get_class();
// If this isn't true, we have gone way off the rails.
Expand Down
61 changes: 61 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,67 @@
(let ((amt_ Reg (lo_reg amt)))
(shl_i128 (put_in_regs src) amt_)))

;; SSE.

;; Since the x86 instruction set does not have any 8x16 shift instructions (even
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
;; instructions. The basic idea, whether the amount to shift by is an immediate
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
(rule (lower (has_type $I8X16 (ishl src amt)))
(let ((src_ Reg (put_in_reg src))
(amt_gpr RegMemImm (put_in_reg_mem_imm amt))
(amt_xmm RegMemImm (reg_mem_imm_to_xmm amt_gpr))
;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
;; correct for half of the lanes; the others must be fixed up with
;; the mask below.
(unmasked Reg (psllw src_ amt_xmm))
(mask_addr SyntheticAmode (ishl_i8x16_mask amt_gpr))
(mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
(value_reg (sse_and $I8X16 unmasked (RegMem.Reg mask)))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
(ishl_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We do this use `lea` to find
;; the base address of the mask table and then complex addressing to offset to
;; the right mask: `base_address + amt << 4`
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
(base_mask_addr Reg (lea mask_table))
(mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
(amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
base_mask_addr
mask_offset
0))))
(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
(ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; Get the address of the mask for a constant 8x16 shift amount.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)

;; Get the address of the mask table for a dynamic 8x16 shift amount.
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
(rule (lower (has_type $I16X8 (ishl src amt)))
(value_reg (psllw (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (ishl src amt)))
(value_reg (pslld (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I64X2 (ishl src amt)))
(value_reg (psllq (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
Expand Down
29 changes: 6 additions & 23 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1539,9 +1539,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Bnot
| Opcode::Bitselect
| Opcode::Vselect
| Opcode::Sshr => implemented_in_isle(ctx),
| Opcode::Sshr
| Opcode::Ishl => implemented_in_isle(ctx),

Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

Expand All @@ -1557,7 +1558,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// This implementation uses the last two encoding methods.
let (size, lhs) = match dst_ty {
types::I8 | types::I16 => match op {
Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
Opcode::Ushr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
Expand Down Expand Up @@ -1589,7 +1589,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
Expand All @@ -1608,7 +1607,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);

match op {
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
Opcode::Ushr | Opcode::Rotl => {
implemented_in_isle(ctx);
}
Opcode::Rotr => {
Expand Down Expand Up @@ -1643,7 +1642,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
_ => unreachable!(),
}
} else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
} else if dst_ty == types::I8X16 && op == Opcode::Ushr {
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
Expand Down Expand Up @@ -1671,7 +1670,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
// the others must be fixed up with the mask below.
let shift_opcode = match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
Expand All @@ -1695,20 +1693,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
const SHL_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];

let mask = match op {
Opcode::Ishl => &SHL_MASKS,
Opcode::Ushr => &USHR_MASKS,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
Expand Down Expand Up @@ -1775,17 +1761,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let sse_op = match dst_ty {
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {
Opcode::Ishl => SseOpcode::Psllq,
Opcode::Ushr => SseOpcode::Psrlq,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
Expand Down
Loading