diff --git a/Cargo.toml b/Cargo.toml index 39c4553fe..0de09019d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ test = false # For more information on this dependency see rust-lang/rust's # `src/tools/rustc-std-workspace` folder core = { version = "1.0.0", optional = true, package = 'rustc-std-workspace-core' } +specialized-div-rem = { version = "0.2.0", features = ["no_std", "asm"] } [build-dependencies] cc = { optional = true, version = "1.0" } diff --git a/src/int/mod.rs b/src/int/mod.rs index 7587bc69e..fa86e3bf3 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -1,17 +1,5 @@ use core::ops; -macro_rules! hty { - ($ty:ty) => { - <$ty as LargeInt>::HighHalf - }; -} - -macro_rules! os_ty { - ($ty:ty) => { - <$ty as Int>::OtherSign - }; -} - pub mod addsub; pub mod mul; pub mod sdiv; diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index c9e252cc3..d399bdafd 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,101 +1,97 @@ -use int::Int; - -trait Div: Int { - /// Returns `a / b` - fn div(self, other: Self) -> Self { - let s_a = self >> (Self::BITS - 1); - let s_b = other >> (Self::BITS - 1); - // NOTE it's OK to overflow here because of the `.unsigned()` below. - // This whole operation is computing the absolute value of the inputs - // So some overflow will happen when dealing with e.g. `i64::MIN` - // where the absolute value is `(-i64::MIN) as u64` - let a = (self ^ s_a).wrapping_sub(s_a); - let b = (other ^ s_b).wrapping_sub(s_b); - let s = s_a ^ s_b; - - let r = a.unsigned().aborting_div(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s +// see udiv.rs for more documentation + +#[cfg(not(target_arch = "x86_64"))] +intrinsics! 
{ + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_idiv] + pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { + specialized_div_rem::i32_div_rem_binary_long(a, b).0 } -} -impl Div for i32 {} -impl Div for i64 {} -impl Div for i128 {} + #[maybe_use_optimized_c_shim] + pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { + specialized_div_rem::i32_div_rem_binary_long(a, b).1 + } + + #[maybe_use_optimized_c_shim] + pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { + let quo_rem = specialized_div_rem::i32_div_rem_binary_long(a, b); + *rem = quo_rem.1; + quo_rem.0 + } -trait Mod: Int { - /// Returns `a % b` - fn mod_(self, other: Self) -> Self { - let s = other >> (Self::BITS - 1); - // NOTE(wrapping_sub) see comment in the `div` - let b = (other ^ s).wrapping_sub(s); - let s = self >> (Self::BITS - 1); - let a = (self ^ s).wrapping_sub(s); + #[maybe_use_optimized_c_shim] + pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { + specialized_div_rem::i64_div_rem_delegate(a, b).0 + } - let r = a.unsigned().aborting_rem(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s + #[maybe_use_optimized_c_shim] + pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { + specialized_div_rem::i64_div_rem_delegate(a, b).1 } -} -impl Mod for i32 {} -impl Mod for i64 {} -impl Mod for i128 {} - -trait Divmod: Int { - /// Returns `a / b` and sets `*rem = n % d` - fn divmod(self, other: Self, rem: &mut Self, div: F) -> Self - where - F: Fn(Self, Self) -> Self, - { - let r = div(self, other); - // NOTE won't overflow because it's using the result from the - // previous division - *rem = self - r.wrapping_mul(other); - r + #[aapcs_on_arm] + pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { + let quo_rem = specialized_div_rem::i64_div_rem_delegate(a, b); + *rem = quo_rem.1; + quo_rem.0 } -} -impl Divmod for i32 {} -impl Divmod for i64 {} + #[win64_128bit_abi_hack] + pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { + 
specialized_div_rem::i128_div_rem_trifecta(a, b).0 + } + #[win64_128bit_abi_hack] + pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { + specialized_div_rem::i128_div_rem_trifecta(a, b).1 + } +} + +#[cfg(target_arch = "x86_64")] intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { - a.div(b) + specialized_div_rem::i32_div_rem_binary_long(a, b).0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - a.div(b) + pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { + specialized_div_rem::i32_div_rem_binary_long(a, b).1 } - #[win64_128bit_abi_hack] - pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - a.div(b) + #[maybe_use_optimized_c_shim] + pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { + let quo_rem = specialized_div_rem::i32_div_rem_binary_long(a, b); + *rem = quo_rem.1; + quo_rem.0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - a.mod_(b) + pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { + specialized_div_rem::i64_div_rem_delegate(a, b).0 } #[maybe_use_optimized_c_shim] pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { - a.mod_(b) + specialized_div_rem::i64_div_rem_delegate(a, b).1 } - #[win64_128bit_abi_hack] - pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - a.mod_(b) + #[aapcs_on_arm] + pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { + let quo_rem = specialized_div_rem::i64_div_rem_delegate(a, b); + *rem = quo_rem.1; + quo_rem.0 } - #[maybe_use_optimized_c_shim] - pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - a.divmod(b, rem, |a, b| __divsi3(a, b)) + #[win64_128bit_abi_hack] + pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { + specialized_div_rem::i128_div_rem_asymmetric(a, b).0 } - #[aapcs_on_arm] - pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - a.divmod(b, rem, |a, b| __divdi3(a, b)) + 
#[win64_128bit_abi_hack] + pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { + specialized_div_rem::i128_div_rem_asymmetric(a, b).1 } -} +} \ No newline at end of file diff --git a/src/int/udiv.rs b/src/int/udiv.rs index b393ac6db..2e58671ef 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -1,270 +1,153 @@ -use int::{Int, LargeInt}; +// NOTE there are panics inside the specialized_div_rem functions if division by 0 +// is encountered, however these should be unreachable and optimized away unless +// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front +// of them. -macro_rules! udivmod_inner { - ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{ - let (n, d, rem) = ($n, $d, $rem); - // NOTE X is unknown, K != 0 - if n.high() == 0 { - if d.high() == 0 { - // 0 X - // --- - // 0 X - - if let Some(rem) = rem { - *rem = <$ty>::from(n.low().aborting_rem(d.low())); - } - return <$ty>::from(n.low().aborting_div(d.low())) - } else { - // 0 X - // --- - // K X - if let Some(rem) = rem { - *rem = n; - } - return 0; - }; +#[cfg(not(target_arch = "x86_64"))] +intrinsics! 
{ + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_uidiv] + /// Returns `n / d` + pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { + specialized_div_rem::u32_div_rem_binary_long(n, d).0 + } + + #[maybe_use_optimized_c_shim] + /// Returns `n % d` + pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { + specialized_div_rem::u32_div_rem_binary_long(n, d).1 + } + + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { + let quo_rem = specialized_div_rem::u32_div_rem_binary_long(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; } - - let mut sr; - let mut q; - let mut r; - - if d.low() == 0 { - if d.high() == 0 { - // K X - // --- - // 0 0 - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n.low() == 0 { - // K 0 - // --- - // K 0 - if let Some(rem) = rem { - *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high())); - } - return <$ty>::from(n.high().aborting_div(d.high())) - } - - // K K - // --- - // K 0 - - if d.high().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1)); - } - return <$ty>::from(n.high() >> d.high().trailing_zeros()); - } - - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 2 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else if d.high() == 0 { - // K X - // --- - // 0 K - if d.low().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from(n.low() & (d.low() - 1)); - } - - if d.low() == 1 { - return n; - } else { - let sr = d.low().trailing_zeros(); - return n >> sr; - }; - } - - sr = 1 + ::BITS + d.low().leading_zeros() - n.high().leading_zeros(); - - // 2 <= sr <= u64::BITS - 1 - q = n << (<$ty>::BITS - sr); - 
r = n >> sr; - } else { - // K X - // --- - // K K - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 1 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - q = n << (<$ty>::BITS - sr); - r = n >> sr; + quo_rem.0 + } + + // `_delegate` is most efficient in the 64 bit range + + #[maybe_use_optimized_c_shim] + /// Returns `n / d` + pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { + specialized_div_rem::u64_div_rem_delegate(n, d).0 + } + + #[maybe_use_optimized_c_shim] + /// Returns `n % d` + pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { + specialized_div_rem::u64_div_rem_delegate(n, d).1 + } + + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + let quo_rem = specialized_div_rem::u64_div_rem_delegate(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; } + quo_rem.0 + } + + // `_trifecta` is efficient for large divisions, even when division + // hardware is not available at all. 
- // Not a special case - // q and r are initialized with - // q = n << (u64::BITS - sr) - // r = n >> sr - // 1 <= sr <= u64::BITS - 1 - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (<$ty>::BITS - 1)); - q = (q << 1) | carry as $ty; + #[win64_128bit_abi_hack] + /// Returns `n / d` + pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { + specialized_div_rem::u128_div_rem_trifecta(n, d).0 + } - // carry = 0 - // if r >= d { - // r -= d; - // carry = 1; - // } - let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1); - carry = (s & 1) as hty!($ty); - r -= d & s as $ty; - } + #[win64_128bit_abi_hack] + /// Returns `n % d` + pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { + specialized_div_rem::u128_div_rem_trifecta(n, d).1 + } + #[win64_128bit_abi_hack] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { + let quo_rem = specialized_div_rem::u128_div_rem_trifecta(n, d); if let Some(rem) = rem { - *rem = r; + *rem = quo_rem.1; } - (q << 1) | carry as $ty - }} + quo_rem.0 + } } +// uses `_asymmetric` instead of `_trifecta`, because x86_64 supplies the `divq` instruction +#[cfg(target_arch = "x86_64")] intrinsics! 
{ #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_uidiv] /// Returns `n / d` pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { - // Special cases - if d == 0 { - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n == 0 { - return 0; - } - - let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros()); - - // d > n - if sr > u32::BITS - 1 { - return 0; - } - - // d == 1 - if sr == u32::BITS - 1 { - return n; - } - - sr += 1; - - // 1 <= sr <= u32::BITS - 1 - let mut q = n << (u32::BITS - sr); - let mut r = n >> sr; - - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (u32::BITS - 1)); - q = (q << 1) | carry; - - // carry = 0; - // if r > d { - // r -= d; - // carry = 1; - // } - - let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1); - carry = (s & 1) as u32; - r -= d & s as u32; - } - - (q << 1) | carry + specialized_div_rem::u32_div_rem_binary_long(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { - let q = __udivsi3(n, d); - n - q * d + specialized_div_rem::u32_div_rem_binary_long(n, d).1 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { - let q = __udivsi3(n, d); + let quo_rem = specialized_div_rem::u32_div_rem_binary_long(n, d); if let Some(rem) = rem { - *rem = n - (q * d); + *rem = quo_rem.1; } - q + quo_rem.0 } - + + // `_delegate` is most efficient in the 64 bit range + #[maybe_use_optimized_c_shim] /// Returns `n / d` pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { - __udivmoddi4(n, d, None) + specialized_div_rem::u64_div_rem_delegate(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` 
pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { - let mut rem = 0; - __udivmoddi4(n, d, Some(&mut rem)); - rem + specialized_div_rem::u64_div_rem_delegate(n, d).1 + } + + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + let quo_rem = specialized_div_rem::u64_div_rem_delegate(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } + + // `_asymmetric` is used for the 128 bit divisions in this block, because it + // can take advantage of the `divq` instruction that x86_64 supplies. #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { - __udivmodti4(n, d, None) + specialized_div_rem::u128_div_rem_asymmetric(n, d).0 } #[win64_128bit_abi_hack] /// Returns `n % d` pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { - let mut rem = 0; - __udivmodti4(n, d, Some(&mut rem)); - rem - } - - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { - udivmod_inner!(n, d, rem, u64) + specialized_div_rem::u128_div_rem_asymmetric(n, d).1 } #[win64_128bit_abi_hack] /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmodti4(n: u128, - d: u128, - rem: Option<&mut u128>) -> u128 { - udivmod_inner!(n, d, rem, u128) + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { + let quo_rem = specialized_div_rem::u128_div_rem_asymmetric(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } -} +} \ No newline at end of file