diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 4542f3386de7..817e7f830c8d 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -550,6 +550,8 @@ pub enum SseOpcode {
     Punpcklbw,
     Pxor,
     Rcpss,
+    Roundps,
+    Roundpd,
     Roundss,
     Roundsd,
     Rsqrtss,
@@ -729,6 +731,8 @@ impl SseOpcode {
             | SseOpcode::Pmovzxdq
             | SseOpcode::Pmulld
             | SseOpcode::Ptest
+            | SseOpcode::Roundps
+            | SseOpcode::Roundpd
             | SseOpcode::Roundss
             | SseOpcode::Roundsd => SSE41,
 
@@ -890,6 +894,8 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Punpcklbw => "punpcklbw",
             SseOpcode::Pxor => "pxor",
             SseOpcode::Rcpss => "rcpss",
+            SseOpcode::Roundps => "roundps",
+            SseOpcode::Roundpd => "roundpd",
             SseOpcode::Roundss => "roundss",
             SseOpcode::Roundsd => "roundsd",
             SseOpcode::Rsqrtss => "rsqrtss",
@@ -1238,6 +1244,26 @@ impl From for FcmpImm {
     }
 }
 
+/// Encode the rounding modes used as part of the Rounding Control field.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub(crate) enum RoundImm {
+    /// Round to nearest; the lowering selects this for `nearest`.
+    RoundNearest = 0x00,
+    /// Round down; the lowering selects this for `floor`.
+    RoundDown = 0x01,
+    /// Round up; the lowering selects this for `ceil`.
+    RoundUp = 0x02,
+    /// Round toward zero; the lowering selects this for `trunc`.
+    RoundZero = 0x03,
+}
+
+impl RoundImm {
+    /// Returns the variant's discriminant as the raw immediate byte.
+    pub(crate) fn encode(self) -> u8 {
+        self as u8
+    }
+}
+
 /// An operand's size in bits.
 #[derive(Clone, Copy, PartialEq)]
 pub enum OperandSize {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 56ecc0e8432a..c0d94d2ab63b 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1981,6 +1981,8 @@ pub(crate) fn emit(
                 SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2),
                 SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3),
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
+                SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
+                SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let rex = if *is64 {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index fb9f0c1c07db..bda26e3f27dc 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3505,6 +3505,27 @@ fn test_x64_emit() {
         "palignr $3, %xmm1, %xmm9",
     ));
 
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Roundps, RegMem::reg(xmm7), w_xmm8, 3, false),
+        "66440F3A08C703",
+        "roundps $3, %xmm7, %xmm8",
+    ));
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Roundpd, RegMem::reg(xmm10), w_xmm7, 2, false),
+        "66410F3A09FA02",
+        "roundpd $2, %xmm10, %xmm7",
+    ));
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Roundps, RegMem::reg(xmm4), w_xmm8, 1, false),
+        "66440F3A08C401",
+        "roundps $1, %xmm4, %xmm8",
+    ));
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Roundpd, RegMem::reg(xmm15), w_xmm15, 0, false),
+        "66450F3A09FF00",
+        "roundpd $0, %xmm15, %xmm15",
+    ));
+
     // ========================================================
     // Pertaining to atomics.
     let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into();
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 09ded3c94882..30cd1b4d2da9 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3207,22 +3207,50 @@ fn lower_insn_to_regs>(
 
             // Lower to VM calls when there's no access to SSE4.1.
             let ty = ty.unwrap();
-            let libcall = match (ty, op) {
-                (types::F32, Opcode::Ceil) => LibCall::CeilF32,
-                (types::F64, Opcode::Ceil) => LibCall::CeilF64,
-                (types::F32, Opcode::Floor) => LibCall::FloorF32,
-                (types::F64, Opcode::Floor) => LibCall::FloorF64,
-                (types::F32, Opcode::Nearest) => LibCall::NearestF32,
-                (types::F64, Opcode::Nearest) => LibCall::NearestF64,
-                (types::F32, Opcode::Trunc) => LibCall::TruncF32,
-                (types::F64, Opcode::Trunc) => LibCall::TruncF64,
-                _ => panic!(
-                    "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
-                    ty, op
-                ),
-            };
-
-            emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
+            if !ty.is_vector() {
+                let libcall = match (op, ty) {
+                    (Opcode::Ceil, types::F32) => LibCall::CeilF32,
+                    (Opcode::Ceil, types::F64) => LibCall::CeilF64,
+                    (Opcode::Floor, types::F32) => LibCall::FloorF32,
+                    (Opcode::Floor, types::F64) => LibCall::FloorF64,
+                    (Opcode::Nearest, types::F32) => LibCall::NearestF32,
+                    (Opcode::Nearest, types::F64) => LibCall::NearestF64,
+                    (Opcode::Trunc, types::F32) => LibCall::TruncF32,
+                    (Opcode::Trunc, types::F64) => LibCall::TruncF64,
+                    _ => panic!(
+                        "unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
+                        ty, op
+                    ),
+                };
+                emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
+            } else {
+                // NOTE(review): ROUNDPS/ROUNDPD are classified as SSE4.1 by
+                // `SseOpcode::available_from`, but no ISA-feature check is visible on this
+                // path and there is no vector libcall fallback; confirm that SSE4.1 is
+                // guaranteed whenever vector types reach this lowering.
+                let (op, mode) = match (op, ty) {
+                    (Opcode::Ceil, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundUp),
+                    (Opcode::Ceil, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundUp),
+                    (Opcode::Floor, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundDown),
+                    (Opcode::Floor, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundDown),
+                    (Opcode::Trunc, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundZero),
+                    (Opcode::Trunc, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundZero),
+                    (Opcode::Nearest, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundNearest),
+                    (Opcode::Nearest, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundNearest),
+                    _ => panic!("Unknown op/ty combination (vector) {:?}/{:?}", op, ty),
+                };
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                // Copy the input into the output register, then round that register in place.
+                ctx.emit(Inst::gen_move(dst, src, ty));
+                ctx.emit(Inst::xmm_rm_r_imm(
+                    op,
+                    RegMem::reg(dst.to_reg()),
+                    dst,
+                    mode.encode(),
+                    false,
+                ));
+            }
         }
 
         Opcode::Load