From 058f58a5f10674e0545b09b6bf18d58cb8696476 Mon Sep 17 00:00:00 2001
From: Johnnie Birch
Date: Tue, 16 Mar 2021 22:08:33 -0700
Subject: [PATCH] Vpopcnt for x64

---
 build.rs                                      |   1 -
 .../codegen/meta/src/shared/instructions.rs   |   3 +
 cranelift/codegen/src/isa/x64/lower.rs        | 695 ++++++++++--------
 cranelift/wasm/src/code_translator.rs         |  10 +-
 4 files changed, 397 insertions(+), 312 deletions(-)

diff --git a/build.rs b/build.rs
index 9ee3b893c784..a2289d24da6f 100644
--- a/build.rs
+++ b/build.rs
@@ -191,7 +191,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
     }
 
     match (testsuite, testname) {
-        ("simd", "simd_i8x16_arith2") => return true, // Unsupported feature: proposed simd operator I8x16Popcnt
         ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6
         ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
         ("simd", "simd_i16x8_extmul_i8x16") => return true,
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index abf7e0e32a81..9a9637f1bfc3 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3314,6 +3314,9 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let x = &Operand::new("x", Int);
+    let a = &Operand::new("a", Int);
+
     ig.push(
         Inst::new(
             "popcnt",
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 2aaa8f69b586..48395dc4ea46 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2708,372 +2708,451 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Popcnt => {
-            let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
-                types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
-                a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a),
-                _ => unreachable!(),
-            };
-
-            if isa_flags.use_popcnt() {
-                match ty {
-                    types::I32 | types::I64 => {
-                        let src = input_to_reg_mem(ctx, inputs[0]);
-                        let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                        ctx.emit(Inst::unary_rm_r(
-                            OperandSize::from_ty(ty),
-                            UnaryRmROpcode::Popcnt,
-                            src,
-                            dst,
-                        ));
-                        return Ok(());
-                    }
-
-                    types::I128 => {
-                        // The number of ones in a 128-bits value is the plain sum of the number of
-                        // ones in its low and high parts. No risk of overflow here.
-                        let dsts = get_output_reg(ctx, outputs[0]);
-                        let dst = dsts.regs()[0];
-                        let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        let srcs = put_input_in_regs(ctx, inputs[0]);
-                        let src_lo = srcs.regs()[0];
-                        let src_hi = srcs.regs()[1];
-
-                        ctx.emit(Inst::unary_rm_r(
-                            OperandSize::Size64,
-                            UnaryRmROpcode::Popcnt,
-                            RegMem::reg(src_lo),
-                            dst,
-                        ));
-                        ctx.emit(Inst::unary_rm_r(
-                            OperandSize::Size64,
-                            UnaryRmROpcode::Popcnt,
-                            RegMem::reg(src_hi),
-                            tmp,
-                        ));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Add,
-                            RegMemImm::reg(tmp.to_reg()),
-                            dst,
-                        ));
-
-                        // Zero the result's high component.
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Xor,
-                            RegMemImm::reg(dsts.regs()[1].to_reg()),
-                            dsts.regs()[1],
-                        ));
-
-                        return Ok(());
-                    }
-                    _ => {}
-                }
-            }
-
-            let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec {
-                (
-                    smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))],
-                    ty,
-                )
-            } else if ty == types::I128 {
-                let regs = put_input_in_regs(ctx, inputs[0]);
-                (
-                    smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])],
-                    types::I64,
-                )
-            } else {
-                // N.B.: explicitly put input in a reg here because the width of the instruction
-                // into which this RM op goes may not match the width of the input type (in fact,
-                // it won't for i32.popcnt), and we don't want a larger than necessary load.
-                (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty)
-            };
-
-            let mut dsts: SmallVec<[Reg; 2]> = smallvec![];
-            for src in srcs {
-                let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                dsts.push(dst.to_reg());
-                if ty == types::I64 {
-                    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                    let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-
-                    // mov src, tmp1
-                    ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // mov 0x7777_7777_7777_7777, cst
-                    ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
-
-                    // andq cst, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::And,
-                        RegMemImm::reg(cst.to_reg()),
-                        tmp1,
-                    ));
-
-                    // mov src, tmp2
-                    ctx.emit(Inst::mov64_rm_r(src, tmp2));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // and cst, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::And,
-                        RegMemImm::reg(cst.to_reg()),
-                        tmp1,
-                    ));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // and cst, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::And,
-                        RegMemImm::reg(cst.to_reg()),
-                        tmp1,
-                    ));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // mov tmp2, dst
-                    ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
-
-                    // shr $4, dst
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(4),
-                        dst,
-                    ));
-
-                    // add tmp2, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::Add,
-                        RegMemImm::reg(tmp2.to_reg()),
-                        dst,
-                    ));
-
-                    // mov $0x0F0F_0F0F_0F0F_0F0F, cst
-                    ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
-
-                    // and cst, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::And,
-                        RegMemImm::reg(cst.to_reg()),
-                        dst,
-                    ));
-
-                    // mov $0x0101_0101_0101_0101, cst
-                    ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
-
-                    // mul cst, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        AluRmiROpcode::Mul,
-                        RegMemImm::reg(cst.to_reg()),
-                        dst,
-                    ));
-
-                    // shr $56, dst
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(56),
-                        dst,
-                    ));
-                } else {
-                    assert_eq!(ty, types::I32);
-
-                    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-
-                    // mov src, tmp1
-                    ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size32,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // andq $0x7777_7777, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::And,
-                        RegMemImm::imm(0x77777777),
-                        tmp1,
-                    ));
-
-                    // mov src, tmp2
-                    ctx.emit(Inst::mov64_rm_r(src, tmp2));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size32,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // and 0x7777_7777, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::And,
-                        RegMemImm::imm(0x77777777),
-                        tmp1,
-                    ));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // shr $1, tmp1
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size32,
-                        ShiftKind::ShiftRightLogical,
-                        Some(1),
-                        tmp1,
-                    ));
-
-                    // and $0x7777_7777, tmp1
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::And,
-                        RegMemImm::imm(0x77777777),
-                        tmp1,
-                    ));
-
-                    // sub tmp1, tmp2
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::Sub,
-                        RegMemImm::reg(tmp1.to_reg()),
-                        tmp2,
-                    ));
-
-                    // mov tmp2, dst
-                    ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
-
-                    // shr $4, dst
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size32,
-                        ShiftKind::ShiftRightLogical,
-                        Some(4),
-                        dst,
-                    ));
-
-                    // add tmp2, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::Add,
-                        RegMemImm::reg(tmp2.to_reg()),
-                        dst,
-                    ));
-
-                    // and $0x0F0F_0F0F, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::And,
-                        RegMemImm::imm(0x0F0F0F0F),
-                        dst,
-                    ));
-
-                    // mul $0x0101_0101, dst
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size32,
-                        AluRmiROpcode::Mul,
-                        RegMemImm::imm(0x01010101),
-                        dst,
-                    ));
-
-                    // shr $24, dst
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size32,
-                        ShiftKind::ShiftRightLogical,
-                        Some(24),
-                        dst,
-                    ));
-                }
-            }
-
-            if dsts.len() == 1 {
-                let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64));
-            } else {
-                assert!(dsts.len() == 2);
-                let final_dst = get_output_reg(ctx, outputs[0]);
-                ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64));
-                ctx.emit(Inst::alu_rmi_r(
-                    OperandSize::Size64,
-                    AluRmiROpcode::Add,
-                    RegMemImm::reg(dsts[1]),
-                    final_dst.regs()[0],
-                ));
-                ctx.emit(Inst::alu_rmi_r(
-                    OperandSize::Size64,
-                    AluRmiROpcode::Xor,
-                    RegMemImm::reg(final_dst.regs()[1].to_reg()),
-                    final_dst.regs()[1],
-                ));
-            }
+            let ty_tmp = ty.unwrap();
+            if !ty_tmp.is_vector() {
+                let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
+                    types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
+                    a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a),
+                    _ => unreachable!(),
+                };
+
+                if isa_flags.use_popcnt() {
+                    match ty {
+                        types::I32 | types::I64 => {
+                            let src = input_to_reg_mem(ctx, inputs[0]);
+                            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                            ctx.emit(Inst::unary_rm_r(
+                                OperandSize::from_ty(ty),
+                                UnaryRmROpcode::Popcnt,
+                                src,
+                                dst,
+                            ));
+                            return Ok(());
+                        }
+
+                        types::I128 => {
+                            // The number of ones in a 128-bit value is the plain sum of the number of
+                            // ones in its low and high parts. No risk of overflow here.
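+                            // (Even in the worst case the sum is 64 + 64 = 128, which fits
+                            // comfortably in the 64-bit result register.)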
+                            let dsts = get_output_reg(ctx, outputs[0]);
+                            let dst = dsts.regs()[0];
+                            let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                            let srcs = put_input_in_regs(ctx, inputs[0]);
+                            let src_lo = srcs.regs()[0];
+                            let src_hi = srcs.regs()[1];
+
+                            ctx.emit(Inst::unary_rm_r(
+                                OperandSize::Size64,
+                                UnaryRmROpcode::Popcnt,
+                                RegMem::reg(src_lo),
+                                dst,
+                            ));
+                            ctx.emit(Inst::unary_rm_r(
+                                OperandSize::Size64,
+                                UnaryRmROpcode::Popcnt,
+                                RegMem::reg(src_hi),
+                                tmp,
+                            ));
+                            ctx.emit(Inst::alu_rmi_r(
+                                OperandSize::Size64,
+                                AluRmiROpcode::Add,
+                                RegMemImm::reg(tmp.to_reg()),
+                                dst,
+                            ));
+
+                            // Zero the result's high component.
+                            ctx.emit(Inst::alu_rmi_r(
+                                OperandSize::Size64,
+                                AluRmiROpcode::Xor,
+                                RegMemImm::reg(dsts.regs()[1].to_reg()),
+                                dsts.regs()[1],
+                            ));
+
+                            return Ok(());
+                        }
+                        _ => {}
+                    }
+                }
+
+                let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec {
+                    (
+                        smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))],
+                        ty,
+                    )
+                } else if ty == types::I128 {
+                    let regs = put_input_in_regs(ctx, inputs[0]);
+                    (
+                        smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])],
+                        types::I64,
+                    )
+                } else {
+                    // N.B.: explicitly put input in a reg here because the width of the instruction
+                    // into which this RM op goes may not match the width of the input type (in fact,
+                    // it won't for i32.popcnt), and we don't want a larger than necessary load.
+                    (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty)
+                };
+
+                let mut dsts: SmallVec<[Reg; 2]> = smallvec![];
+                for src in srcs {
+                    let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                    dsts.push(dst.to_reg());
+                    if ty == types::I64 {
+                        let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                        let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                        let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
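+
+                        // Branch-free SWAR popcount: the three shift/mask/subtract rounds
+                        // below leave each nibble of tmp2 holding the bit count of the
+                        // corresponding nibble of src; the later shr $4/add/mask step folds
+                        // those into per-byte counts, and the 0x0101_0101_0101_0101
+                        // multiply sums every byte into the top byte.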
+
+                        // mov src, tmp1
+                        ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size64,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // mov 0x7777_7777_7777_7777, cst
+                        ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
+
+                        // andq cst, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(cst.to_reg()),
+                            tmp1,
+                        ));
+
+                        // mov src, tmp2
+                        ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size64,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // and cst, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(cst.to_reg()),
+                            tmp1,
+                        ));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size64,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // and cst, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(cst.to_reg()),
+                            tmp1,
+                        ));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // mov tmp2, dst
+                        ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+                        // shr $4, dst
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size64,
+                            ShiftKind::ShiftRightLogical,
+                            Some(4),
+                            dst,
+                        ));
+
+                        // add tmp2, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::Add,
+                            RegMemImm::reg(tmp2.to_reg()),
+                            dst,
+                        ));
+
+                        // mov $0x0F0F_0F0F_0F0F_0F0F, cst
+                        ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
+
+                        // and cst, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(cst.to_reg()),
+                            dst,
+                        ));
+
+                        // mov $0x0101_0101_0101_0101, cst
+                        ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
+
+                        // mul cst, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size64,
+                            AluRmiROpcode::Mul,
+                            RegMemImm::reg(cst.to_reg()),
+                            dst,
+                        ));
+
+                        // shr $56, dst
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size64,
+                            ShiftKind::ShiftRightLogical,
+                            Some(56),
+                            dst,
+                        ));
+                    } else {
+                        assert_eq!(ty, types::I32);
+
+                        let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
+                        let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
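+
+                        // Same SWAR scheme as the 64-bit path, but on 32 bits and with
+                        // immediate masks; the 0x0101_0101 multiply sums the four byte
+                        // counts into the top byte, extracted by the final shr $24.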
+
+                        // mov src, tmp1
+                        ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size32,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // andq $0x7777_7777, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::And,
+                            RegMemImm::imm(0x77777777),
+                            tmp1,
+                        ));
+
+                        // mov src, tmp2
+                        ctx.emit(Inst::mov64_rm_r(src, tmp2));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size32,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // and 0x7777_7777, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::And,
+                            RegMemImm::imm(0x77777777),
+                            tmp1,
+                        ));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // shr $1, tmp1
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size32,
+                            ShiftKind::ShiftRightLogical,
+                            Some(1),
+                            tmp1,
+                        ));
+
+                        // and $0x7777_7777, tmp1
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::And,
+                            RegMemImm::imm(0x77777777),
+                            tmp1,
+                        ));
+
+                        // sub tmp1, tmp2
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::Sub,
+                            RegMemImm::reg(tmp1.to_reg()),
+                            tmp2,
+                        ));
+
+                        // mov tmp2, dst
+                        ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
+
+                        // shr $4, dst
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size32,
+                            ShiftKind::ShiftRightLogical,
+                            Some(4),
+                            dst,
+                        ));
+
+                        // add tmp2, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::Add,
+                            RegMemImm::reg(tmp2.to_reg()),
+                            dst,
+                        ));
+
+                        // and $0x0F0F_0F0F, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::And,
+                            RegMemImm::imm(0x0F0F0F0F),
+                            dst,
+                        ));
+
+                        // mul $0x0101_0101, dst
+                        ctx.emit(Inst::alu_rmi_r(
+                            OperandSize::Size32,
+                            AluRmiROpcode::Mul,
+                            RegMemImm::imm(0x01010101),
+                            dst,
+                        ));
+
+                        // shr $24, dst
+                        ctx.emit(Inst::shift_r(
+                            OperandSize::Size32,
+                            ShiftKind::ShiftRightLogical,
+                            Some(24),
+                            dst,
+                        ));
+                    }
+                }
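+
+                // Combine the per-iteration results: a single count for I8/I16/I32/I64,
+                // or two partial counts to add for I128 (the high half of the I128
+                // result is then zeroed).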
+
+                if dsts.len() == 1 {
+                    let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                    ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64));
+                } else {
+                    assert!(dsts.len() == 2);
+                    let final_dst = get_output_reg(ctx, outputs[0]);
+                    ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64));
+                    ctx.emit(Inst::alu_rmi_r(
+                        OperandSize::Size64,
+                        AluRmiROpcode::Add,
+                        RegMemImm::reg(dsts[1]),
+                        final_dst.regs()[0],
+                    ));
+                    ctx.emit(Inst::alu_rmi_r(
+                        OperandSize::Size64,
+                        AluRmiROpcode::Xor,
+                        RegMemImm::reg(final_dst.regs()[1].to_reg()),
+                        final_dst.regs()[1],
+                    ));
+                }
+            } else {
+                // For SIMD 4.4 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf)
+                //
+                // __m128i count_bytes(__m128i v) {
+                //     __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+                //     __m128i low_mask = _mm_set1_epi8(0x0f);
+                //     __m128i lo = _mm_and_si128(v, low_mask);
+                //     __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), low_mask);
+                //     __m128i cnt1 = _mm_shuffle_epi8(lookup, lo);
+                //     __m128i cnt2 = _mm_shuffle_epi8(lookup, hi);
+                //     return _mm_add_epi8(cnt1, cnt2);
+                // }
+                //
+                // Details of the above algorithm can be found in the reference noted above, but the
+                // basics are to create a lookup table that pre-populates the popcnt values for each
+                // number [0,15]. The algorithm uses shifts to isolate 4-bit sections of the vector,
+                // pshufb as part of the lookup process, and adds together the results.
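+                //
+                // Worked example for a single byte lane (illustrative): v = 0xb7 =
+                // 0b1011_0111 has lo nibble 0x7 (lookup[0x7] = 3) and hi nibble 0xb
+                // (lookup[0xb] = 3), so its popcount is 3 + 3 = 6.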
+
+                // Get input vector and destination
+                let ty = ty.unwrap();
+                let lhs = put_input_in_reg(ctx, inputs[0]);
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+
+                // __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+                static POPCOUNT_4BIT: [u8; 16] = [
+                    0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02,
+                    0x03, 0x03, 0x04,
+                ];
+                let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
+
+                // Create a mask for the lower 4 bits of each subword.
+                static LOW_MASK: [u8; 16] = [0x0F; 16];
+                let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK));
+                let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty));
+
+                // __m128i lo = _mm_and_si128(v, low_mask);
+                let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(lhs), lo));
+
+                // __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), low_mask);
+                ctx.emit(Inst::gen_move(dst, lhs, ty));
+                ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
+                let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pand,
+                    RegMem::reg(dst.to_reg()),
+                    tmp,
+                ));
+
+                // __m128i cnt1 = _mm_shuffle_epi8(lookup, lo);
+                let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
+                ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty));
+                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16));
+
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::reg(lo.to_reg()),
+                    dst,
+                ));
+
+                // __m128i cnt2 = _mm_shuffle_epi8(lookup, hi);
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::reg(tmp.to_reg()),
+                    tmp2,
+                ));
+
+                // return _mm_add_epi8(cnt1, cnt2);
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Paddb,
+                    RegMem::reg(tmp2.to_reg()),
+                    dst,
+                ));
+            }
         }
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index d6403ad45623..d810306892b8 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1859,6 +1859,10 @@ pub fn translate_operator(
         let (a, b) = pop2_with_bitcast(state, I16X8, builder);
         state.push1(builder.ins().widening_pairwise_dot_product_s(a, b));
     }
+    Operator::I8x16Popcnt => {
+        let arg = pop1_with_bitcast(state, type_of(op), builder);
+        state.push1(builder.ins().popcnt(arg));
+    }
     Operator::I64x2ExtendLowI32x4S
     | Operator::I64x2ExtendHighI32x4S
     | Operator::I64x2ExtendLowI32x4U
@@ -1884,8 +1888,7 @@ pub fn translate_operator(
         | Operator::F64x2PromoteLowF32x4
         | Operator::F64x2ConvertLowI32x4U
         | Operator::I32x4TruncSatF64x2SZero
-        | Operator::I32x4TruncSatF64x2UZero
-        | Operator::I8x16Popcnt => {
+        | Operator::I32x4TruncSatF64x2UZero => {
             return Err(wasm_unsupported!("proposed simd operator {:?}", op));
         }
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
@@ -2590,7 +2593,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I8x16MaxS
        | Operator::I8x16MaxU
        | Operator::I8x16RoundingAverageU
-        | Operator::I8x16Bitmask => I8X16,
+        | Operator::I8x16Bitmask
+        | Operator::I8x16Popcnt => I8X16,
 
         Operator::I16x8Splat
         | Operator::V128Load16Splat { .. }