diff --git a/Cargo.toml b/Cargo.toml
index 39c4553fe..0de09019d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ test = false
 # For more information on this dependency see rust-lang/rust's
 # `src/tools/rustc-std-workspace` folder
 core = { version = "1.0.0", optional = true, package = 'rustc-std-workspace-core' }
+specialized-div-rem = { version = "0.2.0", features = ["no_std", "asm"] }
 
 [build-dependencies]
 cc = { optional = true, version = "1.0" }
diff --git a/src/int/mod.rs b/src/int/mod.rs
index 7587bc69e..fa86e3bf3 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -1,17 +1,5 @@
 use core::ops;
 
-macro_rules! hty {
-    ($ty:ty) => {
-        <$ty as LargeInt>::HighHalf
-    };
-}
-
-macro_rules! os_ty {
-    ($ty:ty) => {
-        <$ty as Int>::OtherSign
-    };
-}
-
 pub mod addsub;
 pub mod mul;
 pub mod sdiv;
diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs
index c9e252cc3..d399bdafd 100644
--- a/src/int/sdiv.rs
+++ b/src/int/sdiv.rs
@@ -1,101 +1,97 @@
-use int::Int;
-
-trait Div: Int {
-    /// Returns `a / b`
-    fn div(self, other: Self) -> Self {
-        let s_a = self >> (Self::BITS - 1);
-        let s_b = other >> (Self::BITS - 1);
-        // NOTE it's OK to overflow here because of the `.unsigned()` below.
-        // This whole operation is computing the absolute value of the inputs
-        // So some overflow will happen when dealing with e.g. `i64::MIN`
-        // where the absolute value is `(-i64::MIN) as u64`
-        let a = (self ^ s_a).wrapping_sub(s_a);
-        let b = (other ^ s_b).wrapping_sub(s_b);
-        let s = s_a ^ s_b;
-
-        let r = a.unsigned().aborting_div(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
+// see udiv.rs for more documentation
+
+#[cfg(not(target = "x86_64"))]
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_idiv]
+    pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
+        specialized_div_rem::i32_div_rem_binary_long(a, b).0
     }
-}
 
-impl Div for i32 {}
-impl Div for i64 {}
-impl Div for i128 {}
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
+        specialized_div_rem::i32_div_rem_binary_long(a, b).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
+        let quo_rem = specialized_div_rem::i32_div_rem_binary_long(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
+    }
 
-trait Mod: Int {
-    /// Returns `a % b`
-    fn mod_(self, other: Self) -> Self {
-        let s = other >> (Self::BITS - 1);
-        // NOTE(wrapping_sub) see comment in the `div`
-        let b = (other ^ s).wrapping_sub(s);
-        let s = self >> (Self::BITS - 1);
-        let a = (self ^ s).wrapping_sub(s);
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
+        specialized_div_rem::i64_div_rem_delegate(a, b).0
+    }
 
-        let r = a.unsigned().aborting_rem(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
+        specialized_div_rem::i64_div_rem_delegate(a, b).1
     }
-}
 
-impl Mod for i32 {}
-impl Mod for i64 {}
-impl Mod for i128 {}
-
-trait Divmod: Int {
-    /// Returns `a / b` and sets `*rem = n % d`
-    fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
-    where
-        F: Fn(Self, Self) -> Self,
-    {
-        let r = div(self, other);
-        // NOTE won't overflow because it's using the result from the
-        // previous division
-        *rem = self - r.wrapping_mul(other);
-        r
+    #[aapcs_on_arm]
+    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
+        let quo_rem = specialized_div_rem::i64_div_rem_delegate(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
-}
 
-impl Divmod for i32 {}
-impl Divmod for i64 {}
+    #[win64_128bit_abi_hack]
+    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
+        specialized_div_rem::i128_div_rem_trifecta(a, b).0
+    }
 
+    #[win64_128bit_abi_hack]
+    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
+        specialized_div_rem::i128_div_rem_trifecta(a, b).1
+    }
+}
+
+#[cfg(target = "x86_64")]
 intrinsics! {
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_idiv]
     pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
-        a.div(b)
+        specialized_div_rem::i32_div_rem_binary_long(a, b).0
     }
 
     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
-        a.div(b)
+    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
+        specialized_div_rem::i32_div_rem_binary_long(a, b).1
     }
 
-    #[win64_128bit_abi_hack]
-    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
-        a.div(b)
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
+        let quo_rem = specialized_div_rem::i32_div_rem_binary_long(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 
     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
-        a.mod_(b)
+    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
+        specialized_div_rem::i64_div_rem_delegate(a, b).0
     }
 
     #[maybe_use_optimized_c_shim]
     pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
-        a.mod_(b)
+        specialized_div_rem::i64_div_rem_delegate(a, b).1
     }
 
-    #[win64_128bit_abi_hack]
-    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
-        a.mod_(b)
+    #[aapcs_on_arm]
+    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
+        let quo_rem = specialized_div_rem::i64_div_rem_delegate(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 
-    #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
-        a.divmod(b, rem, |a, b| __divsi3(a, b))
+    #[win64_128bit_abi_hack]
+    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
+        specialized_div_rem::i128_div_rem_asymmetric(a, b).0
     }
 
-    #[aapcs_on_arm]
-    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
-        a.divmod(b, rem, |a, b| __divdi3(a, b))
+    #[win64_128bit_abi_hack]
+    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
+        specialized_div_rem::i128_div_rem_asymmetric(a, b).1
     }
-}
+}
\ No newline at end of file
diff --git a/src/int/udiv.rs b/src/int/udiv.rs
index b393ac6db..2e58671ef 100644
--- a/src/int/udiv.rs
+++ b/src/int/udiv.rs
@@ -1,270 +1,153 @@
-use int::{Int, LargeInt};
+// NOTE: the `specialized_div_rem` functions panic on division by zero. Those panics
+// should be unreachable (and optimized away) as long as every use of
+// `core::intrinsics::unchecked_div`/`unchecked_rem` is guarded by a zero check on
+// the divisor.
 
-macro_rules! udivmod_inner {
-    ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{
-        let (n, d, rem) = ($n, $d, $rem);
-        // NOTE X is unknown, K != 0
-        if n.high() == 0 {
-            if d.high() == 0 {
-                // 0 X
-                // ---
-                // 0 X
-
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from(n.low().aborting_rem(d.low()));
-                }
-                return <$ty>::from(n.low().aborting_div(d.low()))
-            } else {
-                // 0 X
-                // ---
-                // K X
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            };
+#[cfg(not(target_arch = "x86_64"))]
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_uidiv]
+    /// Returns `n / d`
+    pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
+        specialized_div_rem::u32_div_rem_binary_long(n, d).0
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
+    pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
+        specialized_div_rem::u32_div_rem_binary_long(n, d).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
+        let quo_rem = specialized_div_rem::u32_div_rem_binary_long(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
         }
-
-        let mut sr;
-        let mut q;
-        let mut r;
-
-        if d.low() == 0 {
-            if d.high() == 0 {
-                // K X
-                // ---
-                // 0 0
-                // NOTE This should be unreachable in safe Rust because the program will panic before
-                // this intrinsic is called
-                ::abort();
-            }
-
-            if n.low() == 0 {
-                // K 0
-                // ---
-                // K 0
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high()));
-                }
-                return <$ty>::from(n.high().aborting_div(d.high()))
-            }
-
-            // K K
-            // ---
-            // K 0
-
-            if d.high().is_power_of_two() {
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1));
-                }
-                return <$ty>::from(n.high() >> d.high().trailing_zeros());
-            }
-
-            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-            // D > N
-            if sr > <hty!($ty)>::BITS - 2 {
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            }
-
-            sr += 1;
-
-            // 1 <= sr <= <hty!($ty)>::BITS - 1
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
-        } else if d.high() == 0 {
-            // K X
-            // ---
-            // 0 K
-            if d.low().is_power_of_two() {
-                if let Some(rem) = rem {
-                    *rem = <$ty>::from(n.low() & (d.low() - 1));
-                }
-
-                if d.low() == 1 {
-                    return n;
-                } else {
-                    let sr = d.low().trailing_zeros();
-                    return n >> sr;
-                };
-            }
-
-            sr = 1 + <hty!($ty)>::BITS + d.low().leading_zeros() - n.high().leading_zeros();
-
-            // 2 <= sr <= u64::BITS - 1
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
-        } else {
-            // K X
-            // ---
-            // K K
-            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-            // D > N
-            if sr > <hty!($ty)>::BITS - 1 {
-                if let Some(rem) = rem {
-                    *rem = n;
-                }
-                return 0;
-            }
-
-            sr += 1;
-
-            // 1 <= sr <= <hty!($ty)>::BITS
-            q = n << (<$ty>::BITS - sr);
-            r = n >> sr;
+        quo_rem.0
+    }
+
+    // `_delegate` is most efficient in the 64 bit range
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d`
+    pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
+        specialized_div_rem::u64_div_rem_delegate(n, d).0
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
+    pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 {
+        specialized_div_rem::u64_div_rem_delegate(n, d).1
+    }
+
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+        let quo_rem = specialized_div_rem::u64_div_rem_delegate(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
         }
+        quo_rem.0
+    }
+
+    // `_trifecta` is efficient for large divisions, even when division
+    // hardware is not available at all.
 
-        // Not a special case
-        // q and r are initialized with
-        // q = n << (u64::BITS - sr)
-        // r = n >> sr
-        // 1 <= sr <= u64::BITS - 1
-        let mut carry = 0;
-
-        // Don't use a range because they may generate references to memcpy in unoptimized code
-        let mut i = 0;
-        while i < sr {
-            i += 1;
-
-            // r:q = ((r:q) << 1) | carry
-            r = (r << 1) | (q >> (<$ty>::BITS - 1));
-            q = (q << 1) | carry as $ty;
+    #[win64_128bit_abi_hack]
+    /// Returns `n / d`
+    pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
+        specialized_div_rem::u128_div_rem_trifecta(n, d).0
+    }
 
-            // carry = 0
-            // if r >= d {
-            //     r -= d;
-            //     carry = 1;
-            // }
-            let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1);
-            carry = (s & 1) as hty!($ty);
-            r -= d & s as $ty;
-        }
+    #[win64_128bit_abi_hack]
+    /// Returns `n % d`
+    pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
+        specialized_div_rem::u128_div_rem_trifecta(n, d).1
+    }
 
+    #[win64_128bit_abi_hack]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+        let quo_rem = specialized_div_rem::u128_div_rem_trifecta(n, d);
         if let Some(rem) = rem {
-            *rem = r;
+            *rem = quo_rem.1;
         }
-        (q << 1) | carry as $ty
-    }}
+        quo_rem.0
+    }
 }
 
+// uses `_asymmetric` instead of `_trifecta`, because x86_64 supplies the `divq` instruction
+#[cfg(target_arch = "x86_64")]
 intrinsics! {
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_uidiv]
     /// Returns `n / d`
     pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
-        // Special cases
-        if d == 0 {
-            // NOTE This should be unreachable in safe Rust because the program will panic before
-            // this intrinsic is called
-            ::abort();
-        }
-
-        if n == 0 {
-            return 0;
-        }
-
-        let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros());
-
-        // d > n
-        if sr > u32::BITS - 1 {
-            return 0;
-        }
-
-        // d == 1
-        if sr == u32::BITS - 1 {
-            return n;
-        }
-
-        sr += 1;
-
-        // 1 <= sr <= u32::BITS - 1
-        let mut q = n << (u32::BITS - sr);
-        let mut r = n >> sr;
-
-        let mut carry = 0;
-
-        // Don't use a range because they may generate references to memcpy in unoptimized code
-        let mut i = 0;
-        while i < sr {
-            i += 1;
-
-            // r:q = ((r:q) << 1) | carry
-            r = (r << 1) | (q >> (u32::BITS - 1));
-            q = (q << 1) | carry;
-
-            // carry = 0;
-            // if r > d {
-            //     r -= d;
-            //     carry = 1;
-            // }
-
-            let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1);
-            carry = (s & 1) as u32;
-            r -= d & s as u32;
-        }
-
-        (q << 1) | carry
+        specialized_div_rem::u32_div_rem_binary_long(n, d).0
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n % d`
     pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
-        let q = __udivsi3(n, d);
-        n - q * d
+        specialized_div_rem::u32_div_rem_binary_long(n, d).1
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n / d` and sets `*rem = n % d`
     pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
-        let q = __udivsi3(n, d);
+        let quo_rem = specialized_div_rem::u32_div_rem_binary_long(n, d);
         if let Some(rem) = rem {
-            *rem = n - (q * d);
+            *rem = quo_rem.1;
         }
-        q
+        quo_rem.0
     }
 
+    // `_delegate` is most efficient in the 64 bit range
+
     #[maybe_use_optimized_c_shim]
     /// Returns `n / d`
     pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
-        __udivmoddi4(n, d, None)
+        specialized_div_rem::u64_div_rem_delegate(n, d).0
     }
 
     #[maybe_use_optimized_c_shim]
     /// Returns `n % d`
     pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 {
-        let mut rem = 0;
-        __udivmoddi4(n, d, Some(&mut rem));
-        rem
+        specialized_div_rem::u64_div_rem_delegate(n, d).1
+    }
+
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+        let quo_rem = specialized_div_rem::u64_div_rem_delegate(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
     }
+
+    // `_asymmetric` is used for 128 bit division here because x86_64 supplies
+    // the `divq` instruction (see the comment above this `intrinsics!` block).
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d`
     pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
-        __udivmodti4(n, d, None)
+        specialized_div_rem::u128_div_rem_asymmetric(n, d).0
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n % d`
     pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
-        let mut rem = 0;
-        __udivmodti4(n, d, Some(&mut rem));
-        rem
-    }
-
-    /// Returns `n / d` and sets `*rem = n % d`
-    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
-        udivmod_inner!(n, d, rem, u64)
+        specialized_div_rem::u128_div_rem_asymmetric(n, d).1
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d` and sets `*rem = n % d`
-    pub extern "C" fn __udivmodti4(n: u128,
-                                   d: u128,
-                                   rem: Option<&mut u128>) -> u128 {
-        udivmod_inner!(n, d, rem, u128)
+    pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+        let quo_rem = specialized_div_rem::u128_div_rem_asymmetric(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
     }
 }