From 98c359c7bedfa748e43b99a5a6eeb0babd3c1bea Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 11 Aug 2022 22:39:46 -0700 Subject: [PATCH] Lower `fcvt_to_uint_sat` in ISLE --- cranelift/codegen/src/isa/x64/lower.isle | 92 ++++++++++++ cranelift/codegen/src/isa/x64/lower.rs | 140 +----------------- .../filetests/filetests/isa/x64/fcvt.clif | 25 ++-- 3 files changed, 107 insertions(+), 150 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index f3196ab921df..21160e83a2ae 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3104,3 +3104,95 @@ ;; Keeps negative overflow lanes as is. (x64_pxor tmp dst))) +;; The algorithm for converting floats to unsigned ints is a little tricky. The +;; complication arises because we are converting from a signed 64-bit int with a positive +;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended +;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX +;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our +;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but +;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than +;; MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes +;; precisely INT_MAX values we can correctly account for and convert every value in this range +;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction +;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). +;; After the conversion we add INT_MAX+1 back to this converted value, noting again that +;; values we are trying to account for were already set to INT_MAX+1 during the original conversion. +;; We simply have to create a mask and make sure we are adding together only the lanes that need +;; to be accounted for. Digesting it all the steps then are: +;; +;; Step 1 - Account for NaN and negative floats by setting these src values to zero. +;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for +;; reasons described above. +;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX +;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those +;; values that were originally in the range (0..INT_MAX). This will come in handy during +;; step 7 when we zero negative lanes. +;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than +;; UINT_MAX that are now less than INT_MAX thanks to the subtraction. +;; Step 6 - Convert the second set of values (tmp1) +;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been +;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF +;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000 +;; Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less +;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and +;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally +;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). +;; +;; +;; The table below illustrates the result after each step where it matters for the converted set. +;; Note the original value range (original src set) is the final dst in Step 8: +;; +;; Original src set: +;; | Original Value Range | Step 1 | Step 3 | Step 8 | +;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | +;; +;; Copied src set (tmp1): +;; | Step 2 | Step 4 | +;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | +;; +;; | Step 6 | Step 7 | +;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | +(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) + (let ((src Xmm val) + + ;; Converting to unsigned int so if float src is negative or NaN + ;; will first set to zero. + (tmp2 Xmm (x64_pxor src src)) ;; make a zero + (dst Xmm (x64_maxps src tmp2)) + + ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks + ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because + ;; single precision IEEE-754 floats can only accurately represent contingous + ;; integers up to 2^23 and outside of this range it rounds to the closest + ;; integer that it can represent. In the case of INT_MAX, this value gets + ;; represented as 0x4f000000 which is the integer value (INT_MAX+1). + (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) + (tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1))) + (tmp2 Xmm (x64_cvtdq2ps tmp2)) + + ;; Make a copy of these lanes and then do the first conversion. + ;; Overflow lanes greater than the maximum allowed signed value will + ;; set to 0x80000000. Negative and NaN lanes will be 0x0 + (tmp1 Xmm dst) + (dst Xmm (x64_cvttps2dq $F32X4 dst)) + + ;; Set lanes to src - max_signed_int + (tmp1 Xmm (x64_subps tmp1 tmp2)) + + ;; Create mask for all positive lanes to saturate (i.e. greater than + ;; or equal to the maxmimum allowable unsigned int). + (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) + + ;; Convert those set of lanes that have the max_signed_int factored out. + (tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1)) + + ;; Prepare converted lanes by zeroing negative lanes and prepping lanes + ;; that have positive overflow (based on the mask) by setting these lanes + ;; to 0x7FFFFFFF + (tmp1 Xmm (x64_pxor tmp1 tmp2)) + (tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero + (tmp1 Xmm (x64_pmaxsd tmp1 tmp2))) + + ;; Add this second set of converted lanes to the original to properly handle + ;; values greater than max signed int. + (x64_paddd tmp1 dst))) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 941aa6634148..91b3f0fcd107 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -559,146 +559,12 @@ fn lower_insn_to_regs( | Opcode::FcvtLowFromSint | Opcode::FcvtFromUint | Opcode::FcvtToUint - | Opcode::FcvtToSint => { + | Opcode::FcvtToSint + | Opcode::FcvtToUintSat + | Opcode::FcvtToSintSat => { implemented_in_isle(ctx); } - Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let input_ty = ctx.input_ty(insn, 0); - if !input_ty.is_vector() { - implemented_in_isle(ctx); - } else { - if op == Opcode::FcvtToSintSat { - implemented_in_isle(ctx); - } else if op == Opcode::FcvtToUintSat { - // The algorithm for converting floats to unsigned ints is a little tricky. The - // complication arises because we are converting from a signed 64-bit int with a positive - // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended - // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX - // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our - // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but - // which conveniently setting underflows and overflows (smaller than MIN_INT or larger than - // MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes - // precisely INT_MAX values we can correctly account for and convert every value in this range - // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction - // every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). - // After the conversion we add INT_MAX+1 back to this converted value, noting again that - // values we are trying to account for were already set to INT_MAX+1 during the original conversion. - // We simply have to create a mask and make sure we are adding together only the lanes that need - // to be accounted for. Digesting it all the steps then are: - // - // Step 1 - Account for NaN and negative floats by setting these src values to zero. - // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for - // reasons described above. - // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX - // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those - // values that were originally in the range (0..INT_MAX). This will come in handy during - // step 7 when we zero negative lanes. - // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than - // UINT_MAX that are now less than INT_MAX thanks to the subtraction. - // Step 6 - Convert the second set of values (tmp1) - // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been - // converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF - // as this will allow us to properly saturate overflow lanes when adding to 0x80000000 - // Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less - // than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and - // UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally - // greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). - // - // - // The table below illustrates the result after each step where it matters for the converted set. - // Note the original value range (original src set) is the final dst in Step 8: - // - // Original src set: - // | Original Value Range | Step 1 | Step 3 | Step 8 | - // | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | - // - // Copied src set (tmp1): - // | Step 2 | Step 4 | - // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | - // - // | Step 6 | Step 7 | - // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | - - // Create temporaries - assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - - // Converting to unsigned int so if float src is negative or NaN - // will first set to zero. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::gen_move(dst, src, input_ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst)); - - // Set tmp2 to INT_MAX+1. It is important to note here that after it looks - // like we are only converting INT_MAX (0x7FFFFFFF) but in fact because - // single precision IEEE-754 floats can only accurately represent contingous - // integers up to 2^23 and outside of this range it rounds to the closest - // integer that it can represent. In the case of INT_MAX, this value gets - // represented as 0x4f000000 which is the integer value (INT_MAX+1). - - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvtdq2ps, - RegMem::from(tmp2), - tmp2, - )); - - // Make a copy of these lanes and then do the first conversion. - // Overflow lanes greater than the maximum allowed signed value will - // set to 0x80000000. Negative and NaN lanes will be 0x0 - ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(dst), - dst, - )); - - // Set lanes to src - max_signed_int - ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1)); - - // Create mask for all positive lanes to saturate (i.e. greater than - // or equal to the maxmimum allowable unsigned int). - let cond = FcmpImm::from(FloatCC::LessThanOrEqual); - ctx.emit(Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::from(tmp1), - tmp2, - cond.encode(), - OperandSize::Size32, - )); - - // Convert those set of lanes that have the max_signed_int factored out. - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(tmp1), - tmp1, - )); - - // Prepare converted lanes by zeroing negative lanes and prepping lanes - // that have positive overflow (based on the mask) by setting these lanes - // to 0x7FFFFFFF - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1)); - - // Add this second set of converted lanes to the original to properly handle - // values greater than max signed int. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst)); - } else { - // Since this branch is also guarded by a check for vector types - // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here - // due to vector varients not existing. The first two branches will - // cover all reachable cases. - unreachable!(); - } - } - } Opcode::IaddPairwise => { if let (Some(swiden_low), Some(swiden_high)) = ( matches_input(ctx, inputs[0], Opcode::SwidenLow), diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 629c95f0b32d..176edd781da0 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -433,20 +433,19 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pxor %xmm5, %xmm5, %xmm5 -; maxps %xmm0, %xmm5, %xmm0 -; pcmpeqd %xmm5, %xmm5, %xmm5 -; psrld %xmm5, $1, %xmm5 -; cvtdq2ps %xmm5, %xmm5 -; movdqa %xmm0, %xmm11 +; pxor %xmm3, %xmm3, %xmm3 +; maxps %xmm0, %xmm3, %xmm0 +; pcmpeqd %xmm8, %xmm8, %xmm8 +; psrld %xmm8, $1, %xmm8 +; cvtdq2ps %xmm8, %xmm14 +; cvttps2dq %xmm0, %xmm13 +; subps %xmm0, %xmm14, %xmm0 +; cmpps $2, %xmm14, %xmm0, %xmm14 ; cvttps2dq %xmm0, %xmm0 -; subps %xmm11, %xmm5, %xmm11 -; cmpps $2, %xmm5, %xmm11, %xmm5 -; cvttps2dq %xmm11, %xmm11 -; pxor %xmm11, %xmm5, %xmm11 -; pxor %xmm5, %xmm5, %xmm5 -; pmaxsd %xmm11, %xmm5, %xmm11 -; paddd %xmm0, %xmm11, %xmm0 +; pxor %xmm0, %xmm14, %xmm0 +; pxor %xmm7, %xmm7, %xmm7 +; pmaxsd %xmm0, %xmm7, %xmm0 +; paddd %xmm0, %xmm13, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret