summaryrefslogtreecommitdiffstats
path: root/vendor/compiler_builtins/src/int
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /vendor/compiler_builtins/src/int
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/compiler_builtins/src/int')
-rw-r--r--vendor/compiler_builtins/src/int/addsub.rs96
-rw-r--r--vendor/compiler_builtins/src/int/leading_zeros.rs149
-rw-r--r--vendor/compiler_builtins/src/int/mod.rs390
-rw-r--r--vendor/compiler_builtins/src/int/mul.rs138
-rw-r--r--vendor/compiler_builtins/src/int/sdiv.rs169
-rw-r--r--vendor/compiler_builtins/src/int/shift.rs116
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs69
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs548
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs319
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs306
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs106
-rw-r--r--vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs386
-rw-r--r--vendor/compiler_builtins/src/int/udiv.rs106
13 files changed, 2898 insertions, 0 deletions
diff --git a/vendor/compiler_builtins/src/int/addsub.rs b/vendor/compiler_builtins/src/int/addsub.rs
new file mode 100644
index 000000000..f4841e90f
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/addsub.rs
@@ -0,0 +1,96 @@
+use int::{DInt, Int};
+
+trait UAddSub: DInt {
+ fn uadd(self, other: Self) -> Self {
+ let (lo, carry) = self.lo().overflowing_add(other.lo());
+ let hi = self.hi().wrapping_add(other.hi());
+ let carry = if carry { Self::H::ONE } else { Self::H::ZERO };
+ Self::from_lo_hi(lo, hi.wrapping_add(carry))
+ }
+ fn uadd_one(self) -> Self {
+ let (lo, carry) = self.lo().overflowing_add(Self::H::ONE);
+ let carry = if carry { Self::H::ONE } else { Self::H::ZERO };
+ Self::from_lo_hi(lo, self.hi().wrapping_add(carry))
+ }
+ fn usub(self, other: Self) -> Self {
+ let uneg = (!other).uadd_one();
+ self.uadd(uneg)
+ }
+}
+
+impl UAddSub for u128 {}
+
+trait AddSub: Int
+where
+ <Self as Int>::UnsignedInt: UAddSub,
+{
+ fn add(self, other: Self) -> Self {
+ Self::from_unsigned(self.unsigned().uadd(other.unsigned()))
+ }
+ fn sub(self, other: Self) -> Self {
+ Self::from_unsigned(self.unsigned().usub(other.unsigned()))
+ }
+}
+
+impl AddSub for u128 {}
+impl AddSub for i128 {}
+
+trait Addo: AddSub
+where
+ <Self as Int>::UnsignedInt: UAddSub,
+{
+ fn addo(self, other: Self) -> (Self, bool) {
+ let sum = AddSub::add(self, other);
+ (sum, (other < Self::ZERO) != (sum < self))
+ }
+}
+
+impl Addo for i128 {}
+impl Addo for u128 {}
+
+trait Subo: AddSub
+where
+ <Self as Int>::UnsignedInt: UAddSub,
+{
+ fn subo(self, other: Self) -> (Self, bool) {
+ let sum = AddSub::sub(self, other);
+ (sum, (other < Self::ZERO) != (self < sum))
+ }
+}
+
+impl Subo for i128 {}
+impl Subo for u128 {}
+
+intrinsics! {
+ pub extern "C" fn __rust_i128_add(a: i128, b: i128) -> i128 {
+ AddSub::add(a,b)
+ }
+
+ pub extern "C" fn __rust_i128_addo(a: i128, b: i128) -> (i128, bool) {
+ a.addo(b)
+ }
+
+ pub extern "C" fn __rust_u128_add(a: u128, b: u128) -> u128 {
+ AddSub::add(a,b)
+ }
+
+ pub extern "C" fn __rust_u128_addo(a: u128, b: u128) -> (u128, bool) {
+ a.addo(b)
+ }
+
+ pub extern "C" fn __rust_i128_sub(a: i128, b: i128) -> i128 {
+ AddSub::sub(a,b)
+ }
+
+ pub extern "C" fn __rust_i128_subo(a: i128, b: i128) -> (i128, bool) {
+ a.subo(b)
+ }
+
+ pub extern "C" fn __rust_u128_sub(a: u128, b: u128) -> u128 {
+ AddSub::sub(a,b)
+ }
+
+ pub extern "C" fn __rust_u128_subo(a: u128, b: u128) -> (u128, bool) {
+ a.subo(b)
+ }
+}
diff --git a/vendor/compiler_builtins/src/int/leading_zeros.rs b/vendor/compiler_builtins/src/int/leading_zeros.rs
new file mode 100644
index 000000000..9e60ab0d7
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/leading_zeros.rs
@@ -0,0 +1,149 @@
+// Note: these functions happen to produce the correct `usize::leading_zeros(0)` value
+// without a explicit zero check. Zero is probably common enough that it could warrant
+// adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`.
+// Compilers will insert the check for zero in cases where it is needed.
+
+public_test_dep! {
+/// Returns the number of leading binary zeros in `x`.
+#[allow(dead_code)]
+pub(crate) fn usize_leading_zeros_default(x: usize) -> usize {
+ // The basic idea is to test if the higher bits of `x` are zero and bisect the number
+ // of leading zeros. It is possible for all branches of the bisection to use the same
+ // code path by conditionally shifting the higher parts down to let the next bisection
+ // step work on the higher or lower parts of `x`. Instead of starting with `z == 0`
+ // and adding to the number of zeros, it is slightly faster to start with
+ // `z == usize::MAX.count_ones()` and subtract from the potential number of zeros,
+ // because it simplifies the final bisection step.
+ let mut x = x;
+ // the number of potential leading zeros
+ let mut z = usize::MAX.count_ones() as usize;
+ // a temporary
+ let mut t: usize;
+ #[cfg(target_pointer_width = "64")]
+ {
+ t = x >> 32;
+ if t != 0 {
+ z -= 32;
+ x = t;
+ }
+ }
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ {
+ t = x >> 16;
+ if t != 0 {
+ z -= 16;
+ x = t;
+ }
+ }
+ t = x >> 8;
+ if t != 0 {
+ z -= 8;
+ x = t;
+ }
+ t = x >> 4;
+ if t != 0 {
+ z -= 4;
+ x = t;
+ }
+ t = x >> 2;
+ if t != 0 {
+ z -= 2;
+ x = t;
+ }
+ // the last two bisections are combined into one conditional
+ t = x >> 1;
+ if t != 0 {
+ z - 2
+ } else {
+ z - x
+ }
+
+ // We could potentially save a few cycles by using the LUT trick from
+ // "https://embeddedgurus.com/state-space/2014/09/
+ // fast-deterministic-and-portable-counting-leading-zeros/".
+ // However, 256 bytes for a LUT is too large for embedded use cases. We could remove
+ // the last 3 bisections and use this 16 byte LUT for the rest of the work:
+ //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
+ //z -= LUT[x] as usize;
+ //z
+ // However, it ends up generating about the same number of instructions. When benchmarked
+ // on x86_64, it is slightly faster to use the LUT, but this is probably because of OOO
+ // execution effects. Changing to using a LUT and branching is risky for smaller cores.
+}
+}
+
+// The above method does not compile well on RISC-V (because of the lack of predicated
+// instructions), producing code with many branches or using an excessively long
+// branchless solution. This method takes advantage of the set-if-less-than instruction on
+// RISC-V that allows `(x >= power-of-two) as usize` to be branchless.
+
+public_test_dep! {
+/// Returns the number of leading binary zeros in `x`.
+#[allow(dead_code)]
+pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize {
+ let mut x = x;
+ // the number of potential leading zeros
+ let mut z = usize::MAX.count_ones() as usize;
+ // a temporary
+ let mut t: usize;
+
+ // RISC-V does not have a set-if-greater-than-or-equal instruction and
+ // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is
+ // still the most optimal method. A conditional set can only be turned into a single
+ // immediate instruction if `x` is compared with an immediate `imm` (that can fit into
+ // 12 bits) like `x < imm` but not `imm < x` (because the immediate is always on the
+ // right). If we try to save an instruction by using `x < imm` for each bisection, we
+ // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`,
+ // but the immediate will never fit into 12 bits and never save an instruction.
+ #[cfg(target_pointer_width = "64")]
+ {
+ // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise
+ // `t` is set to 0.
+ t = ((x >= (1 << 32)) as usize) << 5;
+ // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the
+ // next step to process.
+ x >>= t;
+ // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential
+ // leading zeros
+ z -= t;
+ }
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ {
+ t = ((x >= (1 << 16)) as usize) << 4;
+ x >>= t;
+ z -= t;
+ }
+ t = ((x >= (1 << 8)) as usize) << 3;
+ x >>= t;
+ z -= t;
+ t = ((x >= (1 << 4)) as usize) << 2;
+ x >>= t;
+ z -= t;
+ t = ((x >= (1 << 2)) as usize) << 1;
+ x >>= t;
+ z -= t;
+ t = (x >= (1 << 1)) as usize;
+ x >>= t;
+ z -= t;
+ // All bits except the LSB are guaranteed to be zero for this final bisection step.
+ // If `x != 0` then `x == 1` and subtracts one potential zero from `z`.
+ z - x
+}
+}
+
+intrinsics! {
+ #[maybe_use_optimized_c_shim]
+ #[cfg(any(
+ target_pointer_width = "16",
+ target_pointer_width = "32",
+ target_pointer_width = "64"
+ ))]
+ /// Returns the number of leading binary zeros in `x`.
+ pub extern "C" fn __clzsi2(x: usize) -> usize {
+ if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+ usize_leading_zeros_riscv(x)
+ } else {
+ usize_leading_zeros_default(x)
+ }
+ }
+}
diff --git a/vendor/compiler_builtins/src/int/mod.rs b/vendor/compiler_builtins/src/int/mod.rs
new file mode 100644
index 000000000..509f9fdae
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/mod.rs
@@ -0,0 +1,390 @@
+use core::ops;
+
+mod specialized_div_rem;
+
+pub mod addsub;
+pub mod leading_zeros;
+pub mod mul;
+pub mod sdiv;
+pub mod shift;
+pub mod udiv;
+
+pub use self::leading_zeros::__clzsi2;
+
+public_test_dep! {
+/// Trait for some basic operations on integers
+pub(crate) trait Int:
+ Copy
+ + core::fmt::Debug
+ + PartialEq
+ + PartialOrd
+ + ops::AddAssign
+ + ops::SubAssign
+ + ops::BitAndAssign
+ + ops::BitOrAssign
+ + ops::BitXorAssign
+ + ops::ShlAssign<i32>
+ + ops::ShrAssign<u32>
+ + ops::Add<Output = Self>
+ + ops::Sub<Output = Self>
+ + ops::Div<Output = Self>
+ + ops::Shl<u32, Output = Self>
+ + ops::Shr<u32, Output = Self>
+ + ops::BitOr<Output = Self>
+ + ops::BitXor<Output = Self>
+ + ops::BitAnd<Output = Self>
+ + ops::Not<Output = Self>
+{
+ /// Type with the same width but other signedness
+ type OtherSign: Int;
+ /// Unsigned version of Self
+ type UnsignedInt: Int;
+
+ /// If `Self` is a signed integer
+ const SIGNED: bool;
+
+ /// The bitwidth of the int type
+ const BITS: u32;
+
+ const ZERO: Self;
+ const ONE: Self;
+ const MIN: Self;
+ const MAX: Self;
+
+ /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing
+ /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111,
+ /// 112,119,120,125,126,127].
+ const FUZZ_LENGTHS: [u8; 20];
+ /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128.
+ const FUZZ_NUM: usize;
+
+ fn unsigned(self) -> Self::UnsignedInt;
+ fn from_unsigned(unsigned: Self::UnsignedInt) -> Self;
+
+ fn from_bool(b: bool) -> Self;
+
+ /// Prevents the need for excessive conversions between signed and unsigned
+ fn logical_shr(self, other: u32) -> Self;
+
+ /// Absolute difference between two integers.
+ fn abs_diff(self, other: Self) -> Self::UnsignedInt;
+
+ // copied from primitive integers, but put in a trait
+ fn is_zero(self) -> bool;
+ fn wrapping_neg(self) -> Self;
+ fn wrapping_add(self, other: Self) -> Self;
+ fn wrapping_mul(self, other: Self) -> Self;
+ fn wrapping_sub(self, other: Self) -> Self;
+ fn wrapping_shl(self, other: u32) -> Self;
+ fn wrapping_shr(self, other: u32) -> Self;
+ fn rotate_left(self, other: u32) -> Self;
+ fn overflowing_add(self, other: Self) -> (Self, bool);
+ fn leading_zeros(self) -> u32;
+}
+}
+
+macro_rules! int_impl_common {
+ ($ty:ty) => {
+ const BITS: u32 = <Self as Int>::ZERO.count_zeros();
+ const SIGNED: bool = Self::MIN != Self::ZERO;
+
+ const ZERO: Self = 0;
+ const ONE: Self = 1;
+ const MIN: Self = <Self>::MIN;
+ const MAX: Self = <Self>::MAX;
+
+ const FUZZ_LENGTHS: [u8; 20] = {
+ let bits = <Self as Int>::BITS;
+ let mut v = [0u8; 20];
+ v[0] = 0;
+ v[1] = 1;
+ v[2] = 2; // important for parity and the iX::MIN case when reversed
+ let mut i = 3;
+ // No need for any more until the byte boundary, because there should be no algorithms
+ // that are sensitive to anything not next to byte boundaries after 2. We also scale
+ // in powers of two, which is important to prevent u128 corner tests from getting too
+ // big.
+ let mut l = 8;
+ loop {
+ if l >= ((bits / 2) as u8) {
+ break;
+ }
+ // get both sides of the byte boundary
+ v[i] = l - 1;
+ i += 1;
+ v[i] = l;
+ i += 1;
+ l *= 2;
+ }
+
+ if bits != 8 {
+ // add the lower side of the middle boundary
+ v[i] = ((bits / 2) - 1) as u8;
+ i += 1;
+ }
+
+ // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
+ // boundary because of algorithms that split the high part up. We reverse the scaling
+ // as we go to Self::BITS.
+ let mid = i;
+ let mut j = 1;
+ loop {
+ v[i] = (bits as u8) - (v[mid - j]) - 1;
+ if j == mid {
+ break;
+ }
+ i += 1;
+ j += 1;
+ }
+ v
+ };
+
+ const FUZZ_NUM: usize = {
+ let log2 = (<Self as Int>::BITS - 1).count_ones() as usize;
+ if log2 == 3 {
+ // case for u8
+ 6
+ } else {
+ // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
+ // boundaries.
+ 8 + (4 * (log2 - 4))
+ }
+ };
+
+ fn from_bool(b: bool) -> Self {
+ b as $ty
+ }
+
+ fn logical_shr(self, other: u32) -> Self {
+ Self::from_unsigned(self.unsigned().wrapping_shr(other))
+ }
+
+ fn is_zero(self) -> bool {
+ self == Self::ZERO
+ }
+
+ fn wrapping_neg(self) -> Self {
+ <Self>::wrapping_neg(self)
+ }
+
+ fn wrapping_add(self, other: Self) -> Self {
+ <Self>::wrapping_add(self, other)
+ }
+
+ fn wrapping_mul(self, other: Self) -> Self {
+ <Self>::wrapping_mul(self, other)
+ }
+
+ fn wrapping_sub(self, other: Self) -> Self {
+ <Self>::wrapping_sub(self, other)
+ }
+
+ fn wrapping_shl(self, other: u32) -> Self {
+ <Self>::wrapping_shl(self, other)
+ }
+
+ fn wrapping_shr(self, other: u32) -> Self {
+ <Self>::wrapping_shr(self, other)
+ }
+
+ fn rotate_left(self, other: u32) -> Self {
+ <Self>::rotate_left(self, other)
+ }
+
+ fn overflowing_add(self, other: Self) -> (Self, bool) {
+ <Self>::overflowing_add(self, other)
+ }
+
+ fn leading_zeros(self) -> u32 {
+ <Self>::leading_zeros(self)
+ }
+ };
+}
+
+macro_rules! int_impl {
+ ($ity:ty, $uty:ty) => {
+ impl Int for $uty {
+ type OtherSign = $ity;
+ type UnsignedInt = $uty;
+
+ fn unsigned(self) -> $uty {
+ self
+ }
+
+ // It makes writing macros easier if this is implemented for both signed and unsigned
+ #[allow(clippy::wrong_self_convention)]
+ fn from_unsigned(me: $uty) -> Self {
+ me
+ }
+
+ fn abs_diff(self, other: Self) -> Self {
+ if self < other {
+ other.wrapping_sub(self)
+ } else {
+ self.wrapping_sub(other)
+ }
+ }
+
+ int_impl_common!($uty);
+ }
+
+ impl Int for $ity {
+ type OtherSign = $uty;
+ type UnsignedInt = $uty;
+
+ fn unsigned(self) -> $uty {
+ self as $uty
+ }
+
+ fn from_unsigned(me: $uty) -> Self {
+ me as $ity
+ }
+
+ fn abs_diff(self, other: Self) -> $uty {
+ self.wrapping_sub(other).wrapping_abs() as $uty
+ }
+
+ int_impl_common!($ity);
+ }
+ };
+}
+
+int_impl!(isize, usize);
+int_impl!(i8, u8);
+int_impl!(i16, u16);
+int_impl!(i32, u32);
+int_impl!(i64, u64);
+int_impl!(i128, u128);
+
+public_test_dep! {
+/// Trait for integers twice the bit width of another integer. This is implemented for all
+/// primitives except for `u8`, because there is not a smaller primitive.
+pub(crate) trait DInt: Int {
+ /// Integer that is half the bit width of the integer this trait is implemented for
+ type H: HInt<D = Self> + Int;
+
+ /// Returns the low half of `self`
+ fn lo(self) -> Self::H;
+ /// Returns the high half of `self`
+ fn hi(self) -> Self::H;
+ /// Returns the low and high halves of `self` as a tuple
+ fn lo_hi(self) -> (Self::H, Self::H);
+ /// Constructs an integer using lower and higher half parts
+ fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self;
+}
+}
+
+public_test_dep! {
+/// Trait for integers half the bit width of another integer. This is implemented for all
+/// primitives except for `u128`, because it there is not a larger primitive.
+pub(crate) trait HInt: Int {
+ /// Integer that is double the bit width of the integer this trait is implemented for
+ type D: DInt<H = Self> + Int;
+
+ /// Widens (using default extension) the integer to have double bit width
+ fn widen(self) -> Self::D;
+ /// Widens (zero extension only) the integer to have double bit width. This is needed to get
+ /// around problems with associated type bounds (such as `Int<Othersign: DInt>`) being unstable
+ fn zero_widen(self) -> Self::D;
+ /// Widens the integer to have double bit width and shifts the integer into the higher bits
+ fn widen_hi(self) -> Self::D;
+ /// Widening multiplication with zero widening. This cannot overflow.
+ fn zero_widen_mul(self, rhs: Self) -> Self::D;
+ /// Widening multiplication. This cannot overflow.
+ fn widen_mul(self, rhs: Self) -> Self::D;
+}
+}
+
+macro_rules! impl_d_int {
+ ($($X:ident $D:ident),*) => {
+ $(
+ impl DInt for $D {
+ type H = $X;
+
+ fn lo(self) -> Self::H {
+ self as $X
+ }
+ fn hi(self) -> Self::H {
+ (self >> <$X as Int>::BITS) as $X
+ }
+ fn lo_hi(self) -> (Self::H, Self::H) {
+ (self.lo(), self.hi())
+ }
+ fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
+ lo.zero_widen() | hi.widen_hi()
+ }
+ }
+ )*
+ };
+}
+
+macro_rules! impl_h_int {
+ ($($H:ident $uH:ident $X:ident),*) => {
+ $(
+ impl HInt for $H {
+ type D = $X;
+
+ fn widen(self) -> Self::D {
+ self as $X
+ }
+ fn zero_widen(self) -> Self::D {
+ (self as $uH) as $X
+ }
+ fn widen_hi(self) -> Self::D {
+ (self as $X) << <$H as Int>::BITS
+ }
+ fn zero_widen_mul(self, rhs: Self) -> Self::D {
+ self.zero_widen().wrapping_mul(rhs.zero_widen())
+ }
+ fn widen_mul(self, rhs: Self) -> Self::D {
+ self.widen().wrapping_mul(rhs.widen())
+ }
+ }
+ )*
+ };
+}
+
+impl_d_int!(u8 u16, u16 u32, u32 u64, u64 u128, i8 i16, i16 i32, i32 i64, i64 i128);
+impl_h_int!(
+ u8 u8 u16,
+ u16 u16 u32,
+ u32 u32 u64,
+ u64 u64 u128,
+ i8 u8 i16,
+ i16 u16 i32,
+ i32 u32 i64,
+ i64 u64 i128
+);
+
+public_test_dep! {
+/// Trait to express (possibly lossy) casting of integers
+pub(crate) trait CastInto<T: Copy>: Copy {
+ fn cast(self) -> T;
+}
+}
+
+macro_rules! cast_into {
+ ($ty:ty) => {
+ cast_into!($ty; usize, isize, u8, i8, u16, i16, u32, i32, u64, i64, u128, i128);
+ };
+ ($ty:ty; $($into:ty),*) => {$(
+ impl CastInto<$into> for $ty {
+ fn cast(self) -> $into {
+ self as $into
+ }
+ }
+ )*};
+}
+
+cast_into!(usize);
+cast_into!(isize);
+cast_into!(u8);
+cast_into!(i8);
+cast_into!(u16);
+cast_into!(i16);
+cast_into!(u32);
+cast_into!(i32);
+cast_into!(u64);
+cast_into!(i64);
+cast_into!(u128);
+cast_into!(i128);
diff --git a/vendor/compiler_builtins/src/int/mul.rs b/vendor/compiler_builtins/src/int/mul.rs
new file mode 100644
index 000000000..07ce061c9
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/mul.rs
@@ -0,0 +1,138 @@
+use int::{DInt, HInt, Int};
+
+trait Mul: DInt
+where
+ Self::H: DInt,
+{
+ fn mul(self, rhs: Self) -> Self {
+ // In order to prevent infinite recursion, we cannot use the `widen_mul` in this:
+ //self.lo().widen_mul(rhs.lo())
+ // .wrapping_add(self.lo().wrapping_mul(rhs.hi()).widen_hi())
+ // .wrapping_add(self.hi().wrapping_mul(rhs.lo()).widen_hi())
+
+ let lhs_lo = self.lo();
+ let rhs_lo = rhs.lo();
+ // construct the widening multiplication using only `Self::H` sized multiplications
+ let tmp_0 = lhs_lo.lo().zero_widen_mul(rhs_lo.lo());
+ let tmp_1 = lhs_lo.lo().zero_widen_mul(rhs_lo.hi());
+ let tmp_2 = lhs_lo.hi().zero_widen_mul(rhs_lo.lo());
+ let tmp_3 = lhs_lo.hi().zero_widen_mul(rhs_lo.hi());
+ // sum up all widening partials
+ let mul = Self::from_lo_hi(tmp_0, tmp_3)
+ .wrapping_add(tmp_1.zero_widen() << (Self::BITS / 4))
+ .wrapping_add(tmp_2.zero_widen() << (Self::BITS / 4));
+ // add the higher partials
+ mul.wrapping_add(lhs_lo.wrapping_mul(rhs.hi()).widen_hi())
+ .wrapping_add(self.hi().wrapping_mul(rhs_lo).widen_hi())
+ }
+}
+
+impl Mul for u64 {}
+impl Mul for i128 {}
+
+pub(crate) trait UMulo: Int + DInt {
+ fn mulo(self, rhs: Self) -> (Self, bool) {
+ match (self.hi().is_zero(), rhs.hi().is_zero()) {
+ // overflow is guaranteed
+ (false, false) => (self.wrapping_mul(rhs), true),
+ (true, false) => {
+ let mul_lo = self.lo().widen_mul(rhs.lo());
+ let mul_hi = self.lo().widen_mul(rhs.hi());
+ let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi());
+ (mul, o || !mul_hi.hi().is_zero())
+ }
+ (false, true) => {
+ let mul_lo = rhs.lo().widen_mul(self.lo());
+ let mul_hi = rhs.lo().widen_mul(self.hi());
+ let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi());
+ (mul, o || !mul_hi.hi().is_zero())
+ }
+ // overflow is guaranteed to not happen, and use a smaller widening multiplication
+ (true, true) => (self.lo().widen_mul(rhs.lo()), false),
+ }
+ }
+}
+
+impl UMulo for u32 {}
+impl UMulo for u64 {}
+impl UMulo for u128 {}
+
+macro_rules! impl_signed_mulo {
+ ($fn:ident, $iD:ident, $uD:ident) => {
+ fn $fn(lhs: $iD, rhs: $iD) -> ($iD, bool) {
+ let mut lhs = lhs;
+ let mut rhs = rhs;
+ // the test against `mul_neg` below fails without this early return
+ if lhs == 0 || rhs == 0 {
+ return (0, false);
+ }
+
+ let lhs_neg = lhs < 0;
+ let rhs_neg = rhs < 0;
+ if lhs_neg {
+ lhs = lhs.wrapping_neg();
+ }
+ if rhs_neg {
+ rhs = rhs.wrapping_neg();
+ }
+ let mul_neg = lhs_neg != rhs_neg;
+
+ let (mul, o) = (lhs as $uD).mulo(rhs as $uD);
+ let mut mul = mul as $iD;
+
+ if mul_neg {
+ mul = mul.wrapping_neg();
+ }
+ if (mul < 0) != mul_neg {
+ // this one check happens to catch all edge cases related to `$iD::MIN`
+ (mul, true)
+ } else {
+ (mul, o)
+ }
+ }
+ };
+}
+
+impl_signed_mulo!(i32_overflowing_mul, i32, u32);
+impl_signed_mulo!(i64_overflowing_mul, i64, u64);
+impl_signed_mulo!(i128_overflowing_mul, i128, u128);
+
+intrinsics! {
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_lmul]
+ #[cfg(any(not(any(target_arch = "riscv32", target_arch = "riscv64")), target_feature = "m"))]
+ pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 {
+ a.mul(b)
+ }
+
+ pub extern "C" fn __multi3(a: i128, b: i128) -> i128 {
+ a.mul(b)
+ }
+
+ pub extern "C" fn __mulosi4(a: i32, b: i32, oflow: &mut i32) -> i32 {
+ let (mul, o) = i32_overflowing_mul(a, b);
+ *oflow = o as i32;
+ mul
+ }
+
+ pub extern "C" fn __mulodi4(a: i64, b: i64, oflow: &mut i32) -> i64 {
+ let (mul, o) = i64_overflowing_mul(a, b);
+ *oflow = o as i32;
+ mul
+ }
+
+ #[unadjusted_on_win64]
+ pub extern "C" fn __muloti4(a: i128, b: i128, oflow: &mut i32) -> i128 {
+ let (mul, o) = i128_overflowing_mul(a, b);
+ *oflow = o as i32;
+ mul
+ }
+
+ pub extern "C" fn __rust_i128_mulo(a: i128, b: i128) -> (i128, bool) {
+ i128_overflowing_mul(a, b)
+ }
+
+ pub extern "C" fn __rust_u128_mulo(a: u128, b: u128) -> (u128, bool) {
+ a.mulo(b)
+ }
+}
diff --git a/vendor/compiler_builtins/src/int/sdiv.rs b/vendor/compiler_builtins/src/int/sdiv.rs
new file mode 100644
index 000000000..f1822f0f8
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/sdiv.rs
@@ -0,0 +1,169 @@
+use int::udiv::*;
+
+macro_rules! sdivmod {
+ (
+ $unsigned_fn:ident, // name of the unsigned division function
+ $signed_fn:ident, // name of the signed division function
+ $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
+ $($attr:tt),* // attributes
+ ) => {
+ intrinsics! {
+ #[avr_skip]
+ $(
+ #[$attr]
+ )*
+ /// Returns `n / d` and sets `*rem = n % d`
+ pub extern "C" fn $signed_fn(a: $iX, b: $iX, rem: &mut $iX) -> $iX {
+ let a_neg = a < 0;
+ let b_neg = b < 0;
+ let mut a = a;
+ let mut b = b;
+ if a_neg {
+ a = a.wrapping_neg();
+ }
+ if b_neg {
+ b = b.wrapping_neg();
+ }
+ let mut r = *rem as $uX;
+ let t = $unsigned_fn(a as $uX, b as $uX, Some(&mut r)) as $iX;
+ let mut r = r as $iX;
+ if a_neg {
+ r = r.wrapping_neg();
+ }
+ *rem = r;
+ if a_neg != b_neg {
+ t.wrapping_neg()
+ } else {
+ t
+ }
+ }
+ }
+ }
+}
+
+macro_rules! sdiv {
+ (
+ $unsigned_fn:ident, // name of the unsigned division function
+ $signed_fn:ident, // name of the signed division function
+ $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
+ $($attr:tt),* // attributes
+ ) => {
+ intrinsics! {
+ #[avr_skip]
+ $(
+ #[$attr]
+ )*
+ /// Returns `n / d`
+ pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
+ let a_neg = a < 0;
+ let b_neg = b < 0;
+ let mut a = a;
+ let mut b = b;
+ if a_neg {
+ a = a.wrapping_neg();
+ }
+ if b_neg {
+ b = b.wrapping_neg();
+ }
+ let t = $unsigned_fn(a as $uX, b as $uX) as $iX;
+ if a_neg != b_neg {
+ t.wrapping_neg()
+ } else {
+ t
+ }
+ }
+ }
+ }
+}
+
+macro_rules! smod {
+ (
+ $unsigned_fn:ident, // name of the unsigned division function
+ $signed_fn:ident, // name of the signed division function
+ $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ $iX:ident, // signed integer type for the inputs and outputs of `$signed_name`
+ $($attr:tt),* // attributes
+ ) => {
+ intrinsics! {
+ #[avr_skip]
+ $(
+ #[$attr]
+ )*
+ /// Returns `n % d`
+ pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
+ let a_neg = a < 0;
+ let b_neg = b < 0;
+ let mut a = a;
+ let mut b = b;
+ if a_neg {
+ a = a.wrapping_neg();
+ }
+ if b_neg {
+ b = b.wrapping_neg();
+ }
+ let r = $unsigned_fn(a as $uX, b as $uX) as $iX;
+ if a_neg {
+ r.wrapping_neg()
+ } else {
+ r
+ }
+ }
+ }
+ }
+}
+
+sdivmod!(
+ __udivmodsi4,
+ __divmodsi4,
+ u32,
+ i32,
+ maybe_use_optimized_c_shim
+);
+// The `#[arm_aeabi_alias = __aeabi_idiv]` attribute cannot be made to work with `intrinsics!` in macros
+intrinsics! {
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_idiv]
+ /// Returns `n / d`
+ pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
+ let a_neg = a < 0;
+ let b_neg = b < 0;
+ let mut a = a;
+ let mut b = b;
+ if a_neg {
+ a = a.wrapping_neg();
+ }
+ if b_neg {
+ b = b.wrapping_neg();
+ }
+ let t = __udivsi3(a as u32, b as u32) as i32;
+ if a_neg != b_neg {
+ t.wrapping_neg()
+ } else {
+ t
+ }
+ }
+}
+smod!(__umodsi3, __modsi3, u32, i32, maybe_use_optimized_c_shim);
+
+sdivmod!(
+ __udivmoddi4,
+ __divmoddi4,
+ u64,
+ i64,
+ maybe_use_optimized_c_shim
+);
+sdiv!(__udivdi3, __divdi3, u64, i64, maybe_use_optimized_c_shim);
+smod!(__umoddi3, __moddi3, u64, i64, maybe_use_optimized_c_shim);
+
+// LLVM does not currently have a `__divmodti4` function, but GCC does
+sdivmod!(
+ __udivmodti4,
+ __divmodti4,
+ u128,
+ i128,
+ maybe_use_optimized_c_shim
+);
+sdiv!(__udivti3, __divti3, u128, i128, win64_128bit_abi_hack);
+smod!(__umodti3, __modti3, u128, i128, win64_128bit_abi_hack);
diff --git a/vendor/compiler_builtins/src/int/shift.rs b/vendor/compiler_builtins/src/int/shift.rs
new file mode 100644
index 000000000..908e619e1
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/shift.rs
@@ -0,0 +1,116 @@
+use int::{DInt, HInt, Int};
+
+trait Ashl: DInt {
+ /// Returns `a << b`, requires `b < Self::BITS`
+ fn ashl(self, shl: u32) -> Self {
+ let n_h = Self::H::BITS;
+ if shl & n_h != 0 {
+ // we only need `self.lo()` because `self.hi()` will be shifted out entirely
+ self.lo().wrapping_shl(shl - n_h).widen_hi()
+ } else if shl == 0 {
+ self
+ } else {
+ Self::from_lo_hi(
+ self.lo().wrapping_shl(shl),
+ self.lo().logical_shr(n_h - shl) | self.hi().wrapping_shl(shl),
+ )
+ }
+ }
+}
+
+impl Ashl for u32 {}
+impl Ashl for u64 {}
+impl Ashl for u128 {}
+
+trait Ashr: DInt {
+ /// Returns arithmetic `a >> b`, requires `b < Self::BITS`
+ fn ashr(self, shr: u32) -> Self {
+ let n_h = Self::H::BITS;
+ if shr & n_h != 0 {
+ Self::from_lo_hi(
+ self.hi().wrapping_shr(shr - n_h),
+ // smear the sign bit
+ self.hi().wrapping_shr(n_h - 1),
+ )
+ } else if shr == 0 {
+ self
+ } else {
+ Self::from_lo_hi(
+ self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr),
+ self.hi().wrapping_shr(shr),
+ )
+ }
+ }
+}
+
+impl Ashr for i32 {}
+impl Ashr for i64 {}
+impl Ashr for i128 {}
+
+trait Lshr: DInt {
+ /// Returns logical `a >> b`, requires `b < Self::BITS`
+ fn lshr(self, shr: u32) -> Self {
+ let n_h = Self::H::BITS;
+ if shr & n_h != 0 {
+ self.hi().logical_shr(shr - n_h).zero_widen()
+ } else if shr == 0 {
+ self
+ } else {
+ Self::from_lo_hi(
+ self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr),
+ self.hi().logical_shr(shr),
+ )
+ }
+ }
+}
+
+impl Lshr for u32 {}
+impl Lshr for u64 {}
+impl Lshr for u128 {}
+
+intrinsics! {
+ #[maybe_use_optimized_c_shim]
+ pub extern "C" fn __ashlsi3(a: u32, b: u32) -> u32 {
+ a.ashl(b)
+ }
+
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_llsl]
+ pub extern "C" fn __ashldi3(a: u64, b: u32) -> u64 {
+ a.ashl(b)
+ }
+
+ pub extern "C" fn __ashlti3(a: u128, b: u32) -> u128 {
+ a.ashl(b)
+ }
+
+ #[maybe_use_optimized_c_shim]
+ pub extern "C" fn __ashrsi3(a: i32, b: u32) -> i32 {
+ a.ashr(b)
+ }
+
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_lasr]
+ pub extern "C" fn __ashrdi3(a: i64, b: u32) -> i64 {
+ a.ashr(b)
+ }
+
+ pub extern "C" fn __ashrti3(a: i128, b: u32) -> i128 {
+ a.ashr(b)
+ }
+
+ #[maybe_use_optimized_c_shim]
+ pub extern "C" fn __lshrsi3(a: u32, b: u32) -> u32 {
+ a.lshr(b)
+ }
+
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_llsr]
+ pub extern "C" fn __lshrdi3(a: u64, b: u32) -> u64 {
+ a.lshr(b)
+ }
+
+ pub extern "C" fn __lshrti3(a: u128, b: u32) -> u128 {
+ a.lshr(b)
+ }
+}
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs
new file mode 100644
index 000000000..56ce188a3
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs
@@ -0,0 +1,69 @@
+/// Creates an unsigned division function optimized for dividing integers with the same
+/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
+/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
+/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
+/// construct a full 128 bit by 128 bit division.
+#[allow(unused_macros)]
+macro_rules! impl_asymmetric {
+ (
+ $fn:ident, // name of the unsigned division function
+ $zero_div_fn:ident, // function called when division by zero is attempted
+ $half_division:ident, // function for division of a $uX by a $uX
+ $asymmetric_division:ident, // function for division of a $uD by a $uX
+ $n_h:expr, // the number of bits in a $iH or $uH
+ $uH:ident, // unsigned integer with half the bit width of $uX
+ $uX:ident, // unsigned integer with half the bit width of $uD
+ $uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+ ) => {
+ /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+ /// tuple.
+ pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+ let n: u32 = $n_h * 2;
+
+ let duo_lo = duo as $uX;
+ let duo_hi = (duo >> n) as $uX;
+ let div_lo = div as $uX;
+ let div_hi = (div >> n) as $uX;
+ if div_hi == 0 {
+ if div_lo == 0 {
+ $zero_div_fn()
+ }
+ if duo_hi < div_lo {
+ // `$uD` by `$uX` division with a quotient that will fit into a `$uX`
+ let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
+ return (quo as $uD, rem as $uD);
+ } else {
+ // Short division using the $uD by $uX division
+ let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+ let tmp = unsafe {
+ $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+ };
+ return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD);
+ }
+ }
+
+ // This has been adapted from
+ // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+ // adapted from Hacker's Delight. This is similar to the two possibility algorithm
+ // in that it uses only more significant parts of `duo` and `div` to divide a large
+ // integer with a smaller division instruction.
+ let div_lz = div_hi.leading_zeros();
+ let div_extra = n - div_lz;
+ let div_sig_n = (div >> div_extra) as $uX;
+ let tmp = unsafe { $asymmetric_division(duo >> 1, div_sig_n) };
+
+ let mut quo = tmp.0 >> ((n - 1) - div_lz);
+ if quo != 0 {
+ quo -= 1;
+ }
+
+ // Note that this is a full `$uD` multiplication being used here
+ let mut rem = duo - (quo as $uD).wrapping_mul(div);
+ if div <= rem {
+ quo += 1;
+ rem -= div;
+ }
+ return (quo as $uD, rem);
+ }
+ };
+}
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs
new file mode 100644
index 000000000..0d7822882
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs
@@ -0,0 +1,548 @@
+/// Creates an unsigned division function that uses binary long division, designed for
+/// computer architectures without division instructions. These functions have good performance for
+/// microarchitectures with large branch miss penalties and architectures without the ability to
+/// predicate instructions. For architectures with predicated instructions, one of the algorithms
+/// described in the documentation of these functions probably has higher performance, and a custom
+/// assembly routine should be used instead.
+#[allow(unused_macros)]
+macro_rules! impl_binary_long {
+ (
+ $fn:ident, // name of the unsigned division function
+ $zero_div_fn:ident, // function called when division by zero is attempted
+ $normalization_shift:ident, // function for finding the normalization shift
+ $n:tt, // the number of bits in a $iX or $uX
+ $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
+ $iX:ident // signed integer type with same bitwidth as `$uX`
+ ) => {
+ /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+ /// tuple.
+ pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
+ let mut duo = duo;
+ // handle edge cases before calling `$normalization_shift`
+ if div == 0 {
+ $zero_div_fn()
+ }
+ if duo < div {
+ return (0, duo);
+ }
+
+ // There are many variations of binary division algorithm that could be used. This
+ // documentation gives a tour of different methods so that future readers wanting to
+ // optimize further do not have to painstakingly derive them. The SWAR variation is
+ // especially hard to understand without reading the less convoluted methods first.
+
+ // You may notice that a `duo < div_original` check is included in many these
+ // algorithms. A critical optimization that many algorithms miss is handling of
+ // quotients that will turn out to have many trailing zeros or many leading zeros. This
+ // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
+ // in cases where the quotient is small. The `duo < div_original` check handles these
+ // cases of early returns and ends up replacing other kinds of mundane checks that
+ // normally terminate a binary division algorithm.
+ //
+ // Something you may see in other algorithms that is not special-cased here is checks
+ // for division by powers of two. The `duo < div_original` check handles this case and
+ // more, however it can be checked up front before the bisection using the
+ // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
+ // compilers should handle most cases where divisions by power of two occur, and we do
+ // not want to add on a few cycles for every division operation just to save a few
+ // cycles rarely.
+
+ // The following example is the most straightforward translation from the way binary
+ // long division is typically visualized:
+ // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
+ // the result from `$normalization_shift(duo, div, false)`.
+ //
+ // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
+ // and `duo` is kept unchanged.
+ // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
+ //
+ // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
+ // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
+ //
+ // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
+ // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
+ // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
+ // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
+ // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
+ // The `duo < div_original` check terminates the algorithm with the correct quotient of
+ // 29u8 and remainder of 4u8
+ /*
+ let div_original = div;
+ let mut shl = $normalization_shift(duo, div, false);
+ let mut quo = 0;
+ loop {
+ let div_shifted = div << shl;
+ let sub = duo.wrapping_sub(div_shifted);
+ // it is recommended to use `println!`s like this if functionality is unclear
+ /*
+ println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
+ duo,
+ div_shifted,
+ sub,
+ quo,
+ shl
+ );
+ */
+ if 0 <= (sub as $iX) {
+ duo = sub;
+ quo += 1 << shl;
+ if duo < div_original {
+ // this branch is optional
+ return (quo, duo)
+ }
+ }
+ if shl == 0 {
+ return (quo, duo)
+ }
+ shl -= 1;
+ }
+ */
+
+ // This restoring binary long division algorithm reduces the number of operations
+ // overall via:
+ // - `pow` can be shifted right instead of recalculating from `shl`
+ // - starting `div` shifted left and shifting it right for each step instead of
+ // recalculating from `shl`
+ // - The `duo < div_original` branch is used to terminate the algorithm instead of the
+ // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
+ // `div` from being shifted off the end. This check also only occurs on half of steps
+ // on average, since it is behind the `(sub as $iX) >= 0` branch.
+ // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
+ // being updated between steps
+ //
+ // There are many variations of this algorithm, but this encompases the largest number
+ // of architectures and does not rely on carry flags, add-with-carry, or SWAR
+ // complications to be decently fast.
+ /*
+ let div_original = div;
+ let shl = $normalization_shift(duo, div, false);
+ let mut div: $uX = div << shl;
+ let mut pow: $uX = 1 << shl;
+ let mut quo: $uX = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as $iX) {
+ duo = sub;
+ quo |= pow;
+ if duo < div_original {
+ return (quo, duo)
+ }
+ }
+ div >>= 1;
+ pow >>= 1;
+ }
+ */
+
+ // If the architecture has flags and predicated arithmetic instructions, it is possible
+ // to do binary long division without branching and in only 3 or 4 instructions. This is
+ // a variation of a 3 instruction central loop from
+ // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
+ //
+ // What allows doing division in only 3 instructions is realizing that instead of
+ // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
+ // place and `duo` can be shifted left. This means `div` does not have to be updated,
+ // but causes edge case problems and makes `duo < div_original` tests harder. Some
+ // architectures have an option to shift an argument in an arithmetic operation, which
+ // means `duo` can be shifted left and subtracted from in one instruction. The other two
+ // instructions are updating `quo` and undoing the subtraction if it turns out things
+ // were not normalized.
+
+ /*
+ // Perform one binary long division step on the already normalized arguments, because
+ // the main. Note that this does a full normalization since the central loop needs
+ // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
+ // variation only did normalization to the nearest 4 steps, but this makes handling edge
+ // cases much harder. We do a full normalization and perform a binary long division
+ // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
+ // of `duo`, then the edge case handler shifts `div` right and does another long
+ // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
+ let div_original = div;
+ let mut shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = (div << shl);
+ let mut quo: $uX = 1;
+ duo = duo.wrapping_sub(div);
+ if duo < div_original {
+ return (1 << shl, duo);
+ }
+ let div_neg: $uX;
+ if (div as $iX) < 0 {
+ // A very ugly edge case where the most significant bit of `div` is set (after
+ // shifting to match `duo` when its most significant bit is at the sign bit), which
+ // leads to the sign bit of `div_neg` being cut off and carries not happening when
+ // they should. This branch performs a long division step that keeps `duo` in place
+ // and shifts `div` down.
+ div >>= 1;
+ div_neg = div.wrapping_neg();
+ let (sub, carry) = duo.overflowing_add(div_neg);
+ duo = sub;
+ quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+ if !carry {
+ duo = duo.wrapping_add(div);
+ }
+ shl -= 1;
+ } else {
+ div_neg = div.wrapping_neg();
+ }
+ // The add-with-carry that updates `quo` needs to have the carry set when a normalized
+ // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
+ // subtraction generates a carry when an unnormalized subtract happens, which is the
+ // opposite of what we want. Instead, we use
+ // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
+ let mut i = shl;
+ loop {
+ if i == 0 {
+ break;
+ }
+ i -= 1;
+ // `ADDS duo, div, duo, LSL #1`
+ // (add `div` to `duo << 1` and set flags)
+ let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
+ duo = sub;
+ // `ADC quo, quo, quo`
+ // (add with carry). Effectively shifts `quo` left by 1 and sets the least
+ // significant bit to the carry.
+ quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+ // `ADDCC duo, duo, div`
+ // (add if carry clear). Undoes the subtraction if no carry was generated.
+ if !carry {
+ duo = duo.wrapping_add(div);
+ }
+ }
+ return (quo, duo >> shl);
+ */
+
+ // This is the SWAR (SIMD within in a register) restoring division algorithm.
+ // This combines several ideas of the above algorithms:
+ // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
+ // restoring division algorithm, some architectures can do the shifting and
+ // subtraction step in one instruction.
+ // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
+ // and adding one.
+ // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
+ // LSB, so what if we use those bits to store `quo`?
+ // Through a complex setup, it is possible to manage `duo` and `quo` in the same
+ // register, and perform one step with 2 or 3 instructions. The only major downsides are
+ // that there is significant setup (it is only saves instructions if `shl` is
+ // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
+ // initiated, and the number of division steps taken has to be exact (we cannot do more
+ // division steps than `shl`, because it introduces edge cases where quotient bits in
+ // `duo` start to collide with the real part of `div`.
+ /*
+ // first step. The quotient bit is stored in `quo` for now
+ let div_original = div;
+ let mut shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = (div << shl);
+ duo = duo.wrapping_sub(div);
+ let mut quo: $uX = 1 << shl;
+ if duo < div_original {
+ return (quo, duo);
+ }
+
+ let mask: $uX;
+ if (div as $iX) < 0 {
+ // deal with same edge case as the 3 instruction restoring division algorithm, but
+ // the quotient bit from this step also has to be stored in `quo`
+ div >>= 1;
+ shl -= 1;
+ let tmp = 1 << shl;
+ mask = tmp - 1;
+ let sub = duo.wrapping_sub(div);
+ if (sub as $iX) >= 0 {
+ // restore
+ duo = sub;
+ quo |= tmp;
+ }
+ if duo < div_original {
+ return (quo, duo);
+ }
+ } else {
+ mask = quo - 1;
+ }
+ // There is now room for quotient bits in `duo`.
+
+ // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
+ // `div` and end up with the subset of `shl` bits being all being set. This subset acts
+ // just like a two's complement negative one. The subset of `div` containing the divisor
+ // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
+ // as long as the quotient stays positive.
+ //
+ // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
+ // adds a quotient bit to the least significant bit.
+ // For example, 89 (0b01011001) divided by 3 (0b11):
+ //
+ // shl:4, div:0b00110000
+ // first step:
+ // duo:0b01011001
+ // + div_neg:0b11010000
+ // ____________________
+ // 0b00101001
+ // quo is set to 0b00010000 and mask is set to 0b00001111 for later
+ //
+ // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
+ // quotient part of `duo` with `^`s.
+ // chars.
+ // div:0b00110000
+ // ^^^^
+ // + 0b11111111
+ // ________________
+ // 0b00101111
+ // ^^^^
+ // div_neg:0b11010001
+ //
+ // first SWAR step:
+ // duo_shl1:0b01010010
+ // ^
+ // + div_neg:0b11010001
+ // ____________________
+ // 0b00100011
+ // ^
+ // second:
+ // duo_shl1:0b01000110
+ // ^^
+ // + div_neg:0b11010001
+ // ____________________
+ // 0b00010111
+ // ^^
+ // third:
+ // duo_shl1:0b00101110
+ // ^^^
+ // + div_neg:0b11010001
+ // ____________________
+ // 0b11111111
+ // ^^^
+ // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
+ // part of `duo` is negative and the third step was an unnormalized step. The restore
+ // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
+ //
+ // duo:0b11111111
+ // ^^^
+ // + div:0b00101111
+ // ^^^^
+ // ________________
+ // 0b00101110
+ // ^^^
+ // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
+ // unset quotient bit.
+ //
+ // final step (`shl` was 4, so exactly 4 steps must be taken)
+ // duo_shl1:0b01011100
+ // ^^^^
+ // + div_neg:0b11010001
+ // ____________________
+ // 0b00101101
+ // ^^^^
+ // The quotient includes the `^` bits added with the `quo` bits from the beginning that
+ // contained the first step and potential edge case step,
+ // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
+ // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
+ // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
+ let div: $uX = div.wrapping_sub(1);
+ let mut i = shl;
+ loop {
+ if i == 0 {
+ break;
+ }
+ i -= 1;
+ duo = duo.wrapping_shl(1).wrapping_sub(div);
+ if (duo as $iX) < 0 {
+ // restore
+ duo = duo.wrapping_add(div);
+ }
+ }
+ // unpack the results of SWAR
+ return ((duo & mask) | quo, duo >> shl);
+ */
+
+ // The problem with the conditional restoring SWAR algorithm above is that, in practice,
+ // it requires assembly code to bring out its full unrolled potential (It seems that
+ // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
+ // that my algorithm intends. On architectures without predicated instructions, the code
+ // gen is especially bad. We need a default software division algorithm that is
+ // guaranteed to get decent code gen for the central loop.
+
+ // For non-SWAR algorithms, there is a way to do binary long division without
+ // predication or even branching. This involves creating a mask from the sign bit and
+ // performing different kinds of steps using that.
+ /*
+ let shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = div << shl;
+ let mut pow: $uX = 1 << shl;
+ let mut quo: $uX = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
+ duo -= div & sign_mask;
+ quo |= pow & sign_mask;
+ div >>= 1;
+ pow >>= 1;
+ if pow == 0 {
+ break;
+ }
+ }
+ return (quo, duo);
+ */
+ // However, it requires about 4 extra operations (smearing the sign bit, negating the
+ // mask, and applying the mask twice) on top of the operations done by the actual
+ // algorithm. With SWAR however, just 2 extra operations are needed, making it
+ // practical and even the most optimal algorithm for some architectures.
+
+ // What we do is use custom assembly for predicated architectures that need software
+ // division, and for the default algorithm use a mask based restoring SWAR algorithm
+ // without conditionals or branches. On almost all architectures, this Rust code is
+ // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
+ // will unroll it in a decent way.
+
+ // standard opening for SWAR algorithm with first step and edge case handling
+ let div_original = div;
+ let mut shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = (div << shl);
+ duo = duo.wrapping_sub(div);
+ let mut quo: $uX = 1 << shl;
+ if duo < div_original {
+ return (quo, duo);
+ }
+ let mask: $uX;
+ if (div as $iX) < 0 {
+ div >>= 1;
+ shl -= 1;
+ let tmp = 1 << shl;
+ mask = tmp - 1;
+ let sub = duo.wrapping_sub(div);
+ if (sub as $iX) >= 0 {
+ duo = sub;
+ quo |= tmp;
+ }
+ if duo < div_original {
+ return (quo, duo);
+ }
+ } else {
+ mask = quo - 1;
+ }
+
+ // central loop
+ div = div.wrapping_sub(1);
+ let mut i = shl;
+ loop {
+ if i == 0 {
+ break;
+ }
+ i -= 1;
+ // shift left 1 and subtract
+ duo = duo.wrapping_shl(1).wrapping_sub(div);
+ // create mask
+ let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
+ // restore
+ duo = duo.wrapping_add(div & mask);
+ }
+ // unpack
+ return ((duo & mask) | quo, duo >> shl);
+
+ // miscellanious binary long division algorithms that might be better for specific
+ // architectures
+
+ // Another kind of long division uses an interesting fact that `div` and `pow` can be
+ // negated when `duo` is negative to perform a "negated" division step that works in
+ // place of any normalization mechanism. This is a non-restoring division algorithm that
+ // is very similar to the non-restoring division algorithms that can be found on the
+ // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
+ // be viewed as shifting the least significant set bit right (e.x. if we enter a series
+ // of negated binary long division steps starting with `quo == 0b1011_0000` and
+ // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
+ // 0b1010_0010, 0b1010_0001).
+ /*
+ let div_original = div;
+ let shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = (div << shl);
+ let mut pow: $uX = 1 << shl;
+ let mut quo: $uX = pow;
+ duo = duo.wrapping_sub(div);
+ if duo < div_original {
+ return (quo, duo);
+ }
+ div >>= 1;
+ pow >>= 1;
+ loop {
+ if (duo as $iX) < 0 {
+ // Negated binary long division step.
+ duo = duo.wrapping_add(div);
+ quo = quo.wrapping_sub(pow);
+ } else {
+ // Normal long division step.
+ if duo < div_original {
+ return (quo, duo)
+ }
+ duo = duo.wrapping_sub(div);
+ quo = quo.wrapping_add(pow);
+ }
+ pow >>= 1;
+ div >>= 1;
+ }
+ */
+
+ // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
+ // SWAR techniques that makes the only difference between steps be negation of `div`.
+ // If there was an architecture with an instruction that negated inputs to an adder
+ // based on conditionals, and in place shifting (or a three input addition operation
+ // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
+ // single instruction central loop is possible. Microarchitectures often have inputs to
+ // their ALU that can invert the arguments and carry in of adders, but the architectures
+ // unfortunately do not have an instruction to dynamically invert this input based on
+ // conditionals.
+ /*
+ // SWAR opening
+ let div_original = div;
+ let mut shl = $normalization_shift(duo, div, true);
+ let mut div: $uX = (div << shl);
+ duo = duo.wrapping_sub(div);
+ let mut quo: $uX = 1 << shl;
+ if duo < div_original {
+ return (quo, duo);
+ }
+ let mask: $uX;
+ if (div as $iX) < 0 {
+ div >>= 1;
+ shl -= 1;
+ let tmp = 1 << shl;
+ let sub = duo.wrapping_sub(div);
+ if (sub as $iX) >= 0 {
+ // restore
+ duo = sub;
+ quo |= tmp;
+ }
+ if duo < div_original {
+ return (quo, duo);
+ }
+ mask = tmp - 1;
+ } else {
+ mask = quo - 1;
+ }
+
+ // central loop
+ let div: $uX = div.wrapping_sub(1);
+ let mut i = shl;
+ loop {
+ if i == 0 {
+ break;
+ }
+ i -= 1;
+ // note: the `wrapping_shl(1)` can be factored out, but would require another
+ // restoring division step to prevent `(duo as $iX)` from overflowing
+ if (duo as $iX) < 0 {
+ // Negated binary long division step.
+ duo = duo.wrapping_shl(1).wrapping_add(div);
+ } else {
+ // Normal long division step.
+ duo = duo.wrapping_shl(1).wrapping_sub(div);
+ }
+ }
+ if (duo as $iX) < 0 {
+ // Restore. This was not needed in the original nonrestoring algorithm because of
+ // the `duo < div_original` checks.
+ duo = duo.wrapping_add(div);
+ }
+ // unpack
+ return ((duo & mask) | quo, duo >> shl);
+ */
+ }
+ };
+}
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs
new file mode 100644
index 000000000..330c6e4f8
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs
@@ -0,0 +1,319 @@
+/// Creates an unsigned division function that uses a combination of hardware division and
+/// binary long division to divide integers larger than what hardware division by itself can do. This
+/// function is intended for microarchitectures that have division hardware, but not fast enough
+/// multiplication hardware for `impl_trifecta` to be faster.
+#[allow(unused_macros)]
+macro_rules! impl_delegate {
+ (
+ $fn:ident, // name of the unsigned division function
+ $zero_div_fn:ident, // function called when division by zero is attempted
+ $half_normalization_shift:ident, // function for finding the normalization shift of $uX
+ $half_division:ident, // function for division of a $uX by a $uX
+ $n_h:expr, // the number of bits in $iH or $uH
+ $uH:ident, // unsigned integer with half the bit width of $uX
+ $uX:ident, // unsigned integer with half the bit width of $uD.
+ $uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
+ $iD:ident // signed integer type with the same bitwidth as `$uD`
+ ) => {
+ /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+ /// tuple.
+ pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+ // The two possibility algorithm, undersubtracting long division algorithm, or any kind
+ // of reciprocal based algorithm will not be fastest, because they involve large
+ // multiplications that we assume to not be fast enough relative to the divisions to
+ // outweigh setup times.
+
+ // the number of bits in a $uX
+ let n = $n_h * 2;
+
+ let duo_lo = duo as $uX;
+ let duo_hi = (duo >> n) as $uX;
+ let div_lo = div as $uX;
+ let div_hi = (div >> n) as $uX;
+
+ match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+ (true, true, _) => $zero_div_fn(),
+ (_, false, true) => {
+ // `duo` < `div`
+ return (0, duo);
+ }
+ (false, true, true) => {
+ // delegate to smaller division
+ let tmp = $half_division(duo_lo, div_lo);
+ return (tmp.0 as $uD, tmp.1 as $uD);
+ }
+ (false, true, false) => {
+ if duo_hi < div_lo {
+ // `quo_hi` will always be 0. This performs a binary long division algorithm
+ // to zero `duo_hi` followed by a half division.
+
+ // We can calculate the normalization shift using only `$uX` size functions.
+ // If we calculated the normalization shift using
+ // `$half_normalization_shift(duo_hi, div_lo false)`, it would break the
+ // assumption the function has that the first argument is more than the
+ // second argument. If the arguments are switched, the assumption holds true
+ // since `duo_hi < div_lo`.
+ let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
+ let shl = if norm_shift == 0 {
+ // Consider what happens if the msbs of `duo_hi` and `div_lo` align with
+ // no shifting. The normalization shift will always return
+ // `norm_shift == 0` regardless of whether it is fully normalized,
+ // because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would
+ // result in shift overflow down the line. For the edge case, because
+ // both `duo_hi < div_lo` and we are comparing all the significant bits
+ // of `duo_hi` and `div`, we can make `shl = n - 1`.
+ n - 1
+ } else {
+ // We also cannot just use `shl = n - norm_shift - 1` in the general
+ // case, because when we are not in the edge case comparing all the
+ // significant bits, then the full `duo < div` may not be true and thus
+ // breaks the division algorithm.
+ n - norm_shift
+ };
+
+ // The 3 variable restoring division algorithm (see binary_long.rs) is ideal
+ // for this task, since `pow` and `quo` can be `$uX` and the delegation
+ // check is simple.
+ let mut div: $uD = div << shl;
+ let mut pow_lo: $uX = 1 << shl;
+ let mut quo_lo: $uX = 0;
+ let mut duo = duo;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as $iD) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ let duo_hi = (duo >> n) as $uX;
+ if duo_hi == 0 {
+ // Delegate to get the rest of the quotient. Note that the
+ // `div_lo` here is the original unshifted `div`.
+ let tmp = $half_division(duo as $uX, div_lo);
+ return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ } else if duo_hi == div_lo {
+ // `quo_hi == 1`. This branch is cheap and helps with edge cases.
+ let tmp = $half_division(duo as $uX, div as $uX);
+ return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
+ } else {
+ // `div_lo < duo_hi`
+ // `rem_hi == 0`
+ if (div_lo >> $n_h) == 0 {
+ // Short division of $uD by a $uH, using $uX by $uX division
+ let div_0 = div_lo as $uH as $uX;
+ let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+ let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
+ let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+ let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
+ let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+ return (
+ (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
+ rem_1 as $uD,
+ );
+ }
+
+ // This is basically a short division composed of a half division for the hi
+ // part, specialized 3 variable binary long division in the middle, and
+ // another half division for the lo part.
+ let duo_lo = duo as $uX;
+ let tmp = $half_division(duo_hi, div_lo);
+ let quo_hi = tmp.0;
+ let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
+ // This check is required to avoid breaking the long division below.
+ if duo < div {
+ return ((quo_hi as $uD) << n, duo);
+ }
+
+ // The half division handled all shift alignments down to `n`, so this
+ // division can continue with a shift of `n - 1`.
+ let mut div: $uD = div << (n - 1);
+ let mut pow_lo: $uX = 1 << (n - 1);
+ let mut quo_lo: $uX = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as $iD) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ let duo_hi = (duo >> n) as $uX;
+ if duo_hi == 0 {
+ // Delegate to get the rest of the quotient. Note that the
+ // `div_lo` here is the original unshifted `div`.
+ let tmp = $half_division(duo as $uX, div_lo);
+ return (
+ (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n),
+ tmp.1 as $uD,
+ );
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ }
+ }
+ (_, false, false) => {
+ // Full $uD by $uD binary long division. `quo_hi` will always be 0.
+ if duo < div {
+ return (0, duo);
+ }
+ let div_original = div;
+ let shl = $half_normalization_shift(duo_hi, div_hi, false);
+ let mut duo = duo;
+ let mut div: $uD = div << shl;
+ let mut pow_lo: $uX = 1 << shl;
+ let mut quo_lo: $uX = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as $iD) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ if duo < div_original {
+ return (quo_lo as $uD, duo);
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ }
+ }
+ }
+ };
+}
+
+public_test_dep! {
+/// Returns `n / d` and sets `*rem = n % d`.
+///
+/// This specialization exists because:
+/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
+/// so we have to use an old fashioned `&mut u128` argument to return the remainder.
+/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
+/// delegate algorithm strategy the only reasonably fast way to perform `u128` division.
+// used on SPARC
+#[allow(dead_code)]
+pub(crate) fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
+ use super::*;
+ let duo_lo = duo as u64;
+ let duo_hi = (duo >> 64) as u64;
+ let div_lo = div as u64;
+ let div_hi = (div >> 64) as u64;
+
+ match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+ (true, true, _) => zero_div_fn(),
+ (_, false, true) => {
+ *rem = duo;
+ return 0;
+ }
+ (false, true, true) => {
+ let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
+ *rem = tmp.1 as u128;
+ return tmp.0 as u128;
+ }
+ (false, true, false) => {
+ if duo_hi < div_lo {
+ let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
+ let shl = if norm_shift == 0 {
+ 64 - 1
+ } else {
+ 64 - norm_shift
+ };
+
+ let mut div: u128 = div << shl;
+ let mut pow_lo: u64 = 1 << shl;
+ let mut quo_lo: u64 = 0;
+ let mut duo = duo;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as i128) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ let duo_hi = (duo >> 64) as u64;
+ if duo_hi == 0 {
+ let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+ *rem = tmp.1 as u128;
+ return (quo_lo | tmp.0) as u128;
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ } else if duo_hi == div_lo {
+ let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
+ *rem = tmp.1 as u128;
+ return (1 << 64) | (tmp.0 as u128);
+ } else {
+ if (div_lo >> 32) == 0 {
+ let div_0 = div_lo as u32 as u64;
+ let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
+
+ let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
+ let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
+
+ let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
+ let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
+
+ *rem = rem_1 as u128;
+ return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
+ }
+
+ let duo_lo = duo as u64;
+ let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
+ let quo_hi = tmp.0;
+ let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
+ if duo < div {
+ *rem = duo;
+ return (quo_hi as u128) << 64;
+ }
+
+ let mut div: u128 = div << (64 - 1);
+ let mut pow_lo: u64 = 1 << (64 - 1);
+ let mut quo_lo: u64 = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as i128) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ let duo_hi = (duo >> 64) as u64;
+ if duo_hi == 0 {
+ let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+ *rem = tmp.1 as u128;
+ return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ }
+ }
+ (_, false, false) => {
+ if duo < div {
+ *rem = duo;
+ return 0;
+ }
+ let div_original = div;
+ let shl = u64_normalization_shift(duo_hi, div_hi, false);
+ let mut duo = duo;
+ let mut div: u128 = div << shl;
+ let mut pow_lo: u64 = 1 << shl;
+ let mut quo_lo: u64 = 0;
+ loop {
+ let sub = duo.wrapping_sub(div);
+ if 0 <= (sub as i128) {
+ duo = sub;
+ quo_lo |= pow_lo;
+ if duo < div_original {
+ *rem = duo;
+ return quo_lo as u128;
+ }
+ }
+ div >>= 1;
+ pow_lo >>= 1;
+ }
+ }
+ }
+}
+}
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs
new file mode 100644
index 000000000..6ec4675df
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs
@@ -0,0 +1,306 @@
+// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
+#![allow(unused_unsafe)]
+// The functions are complex with many branches, and explicit
+// `return`s makes it clear where function exit points are
+#![allow(clippy::needless_return)]
+#![allow(clippy::comparison_chain)]
+// Clippy is confused by the complex configuration
+#![allow(clippy::if_same_then_else)]
+#![allow(clippy::needless_bool)]
+
+//! This `specialized_div_rem` module is originally from version 1.0.0 of the
+//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
+//! module, since unoptimized compilation may generate references to `memcpy`.
+//!
+//! The purpose of these macros is to easily change the both the division algorithm used
+//! for a given integer size and the half division used by that algorithm. The way
+//! functions call each other is also constructed such that linkers will find the chain of
+//! software and hardware divisions needed for every size of signed and unsigned division.
+//! For example, most target compilations do the following:
+//!
+//! - Many 128 bit division functions like `u128::wrapping_div` use
+//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
+//! is not a 128 bit by 128 bit hardware division function in most architectures.
+//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
+//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
+//! one function to calculate both the quotient and remainder. If configuration flags
+//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
+//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
+//! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
+//! reduced to those instructions. Note that we do not specify the half size division
+//! directly to be `__udivdi3`, because hardware division would never be introduced.
+//! - If the architecture does not supply a 64 bit hardware division instruction, u64
+//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
+//! which is defined by `impl_delegate!`. The half division for this algorithm is
+//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
+//! software division algorithms.
+//! - If the architecture does not supply a 32 bit hardware instruction, linkers will
+//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
+//! division, so the chain of calls ends here.
+//!
+//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
+//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
+//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
+
+// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
+// ridiculously complex
+
+#[macro_use]
+mod norm_shift;
+
+#[macro_use]
+mod binary_long;
+
+#[macro_use]
+mod delegate;
+
+// used on SPARC
+#[allow(unused_imports)]
+#[cfg(not(feature = "public-test-deps"))]
+pub(crate) use self::delegate::u128_divide_sparc;
+
+#[cfg(feature = "public-test-deps")]
+pub use self::delegate::u128_divide_sparc;
+
+#[macro_use]
+mod trifecta;
+
+#[macro_use]
+mod asymmetric;
+
+/// The behavior of all divisions by zero is controlled by this function. This function should be
+/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
+/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
+fn zero_div_fn() -> ! {
+ unsafe { core::hint::unreachable_unchecked() }
+}
+
+const USE_LZ: bool = {
+ if cfg!(target_arch = "arm") {
+ if cfg!(target_feature = "thumb-mode") {
+ // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
+ // supported. This is needed to successfully differentiate between targets like
+ // `thumbv8.base` and `thumbv8.main`.
+ cfg!(target_feature = "v6t2")
+ } else {
+ // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
+ // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
+ // feature does not seem to work.
+ cfg!(target_feature = "v5te")
+ }
+ } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
+ // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
+ cfg!(target_feature = "vis3")
+ } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+ // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
+ cfg!(target_feature = "b")
+ } else {
+ // All other common targets Rust supports should have CLZ instructions
+ true
+ }
+};
+
+impl_normalization_shift!(
+ u32_normalization_shift,
+ USE_LZ,
+ 32,
+ u32,
+ i32,
+ allow(dead_code)
+);
+impl_normalization_shift!(
+ u64_normalization_shift,
+ USE_LZ,
+ 64,
+ u64,
+ i64,
+ allow(dead_code)
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
+ if let Some(quo) = duo.checked_div(div) {
+ if let Some(rem) = duo.checked_rem(div) {
+ return (quo, rem);
+ }
+ }
+ zero_div_fn()
+}
+
+// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
+// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
+// faster if the target pointer width is at least 64.
+#[cfg(all(
+ not(any(target_pointer_width = "16", target_pointer_width = "32")),
+ not(all(not(feature = "no-asm"), target_arch = "x86_64")),
+ not(any(target_arch = "sparc", target_arch = "sparc64"))
+))]
+impl_trifecta!(
+ u128_div_rem,
+ zero_div_fn,
+ u64_by_u64_div_rem,
+ 32,
+ u32,
+ u64,
+ u128
+);
+
+// If the pointer width less than 64, then the target architecture almost certainly does not have
+// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
+#[cfg(all(
+ any(target_pointer_width = "16", target_pointer_width = "32"),
+ not(all(not(feature = "no-asm"), target_arch = "x86_64")),
+ not(any(target_arch = "sparc", target_arch = "sparc64"))
+))]
+impl_delegate!(
+ u128_div_rem,
+ zero_div_fn,
+ u64_normalization_shift,
+ u64_by_u64_div_rem,
+ 32,
+ u32,
+ u64,
+ u128,
+ i128
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u64`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
+#[inline]
+unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
+ let duo_lo = duo as u64;
+ let duo_hi = (duo >> 64) as u64;
+ let quo: u64;
+ let rem: u64;
+ unsafe {
+ // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
+ // by `div`. The quotient is stored in rax and the remainder in rdx.
+ // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+ core::arch::asm!(
+ "div {0}",
+ in(reg) div,
+ inlateout("rax") duo_lo => quo,
+ inlateout("rdx") duo_hi => rem,
+ options(att_syntax, pure, nomem, nostack)
+ );
+ }
+ (quo, rem)
+}
+
+// use `asymmetric` instead of `trifecta` on x86_64
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
+impl_asymmetric!(
+ u128_div_rem,
+ zero_div_fn,
+ u64_by_u64_div_rem,
+ u128_by_u64_div_rem,
+ 32,
+ u32,
+ u64,
+ u128
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+#[allow(dead_code)]
+fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
+ if let Some(quo) = duo.checked_div(div) {
+ if let Some(rem) = duo.checked_rem(div) {
+ return (quo, rem);
+ }
+ }
+ zero_div_fn()
+}
+
+// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
+// than register size.
+#[cfg(all(
+ not(all(not(feature = "no-asm"), target_arch = "x86")),
+ not(target_pointer_width = "64")
+))]
+impl_delegate!(
+ u64_div_rem,
+ zero_div_fn,
+ u32_normalization_shift,
+ u32_by_u32_div_rem,
+ 16,
+ u16,
+ u32,
+ u64,
+ i64
+);
+
+// When not on x86 and the pointer width is 64, use `binary_long`.
+#[cfg(all(
+ not(all(not(feature = "no-asm"), target_arch = "x86")),
+ target_pointer_width = "64"
+))]
+impl_binary_long!(
+ u64_div_rem,
+ zero_div_fn,
+ u64_normalization_shift,
+ 64,
+ u64,
+ i64
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u32`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
+#[inline]
+unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
+ let duo_lo = duo as u32;
+ let duo_hi = (duo >> 32) as u32;
+ let quo: u32;
+ let rem: u32;
+ unsafe {
+ // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
+ // by `div`. The quotient is stored in rax and the remainder in rdx.
+ // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+ core::arch::asm!(
+ "div {0}",
+ in(reg) div,
+ inlateout("rax") duo_lo => quo,
+ inlateout("rdx") duo_hi => rem,
+ options(att_syntax, pure, nomem, nostack)
+ );
+ }
+ (quo, rem)
+}
+
+// use `asymmetric` instead of `delegate` on x86
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
+impl_asymmetric!(
+ u64_div_rem,
+ zero_div_fn,
+ u32_by_u32_div_rem,
+ u64_by_u32_div_rem,
+ 16,
+ u16,
+ u32,
+ u64
+);
+
+// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
+impl_binary_long!(
+ u32_div_rem,
+ zero_div_fn,
+ u32_normalization_shift,
+ 32,
+ u32,
+ i32
+);
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs
new file mode 100644
index 000000000..61b67b6bc
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs
@@ -0,0 +1,106 @@
+/// Creates a function used by some division algorithms to compute the "normalization shift".
+#[allow(unused_macros)]
+macro_rules! impl_normalization_shift {
+ (
+ $name:ident, // name of the normalization shift function
+ // boolean for if `$uX::leading_zeros` should be used (if an architecture does not have a
+ // hardware instruction for `usize::leading_zeros`, then this should be `true`)
+ $use_lz:ident,
+ $n:tt, // the number of bits in a $iX or $uX
+ $uX:ident, // unsigned integer type for the inputs of `$name`
+ $iX:ident, // signed integer type for the inputs of `$name`
+ $($unsigned_attr:meta),* // attributes for the function
+ ) => {
+ /// Finds the shift left that the divisor `div` would need to be normalized for a binary
+ /// long division step with the dividend `duo`. NOTE: This function assumes that these edge
+ /// cases have been handled before reaching it:
+ /// `
+ /// if div == 0 {
+ /// panic!("attempt to divide by zero")
+ /// }
+ /// if duo < div {
+ /// return (0, duo)
+ /// }
+ /// `
+ ///
+ /// Normalization is defined as (where `shl` is the output of this function):
+ /// `
+ /// if duo.leading_zeros() != (div << shl).leading_zeros() {
+ /// // If the most significant bits of `duo` and `div << shl` are not in the same place,
+ /// // then `div << shl` has one more leading zero than `duo`.
+ /// assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+ /// // Also, `2*(div << shl)` is not more than `duo` (otherwise the first division step
+ /// // would not be able to clear the msb of `duo`)
+ /// assert!(duo < (div << (shl + 1)));
+ /// }
+ /// if full_normalization {
+ /// // Some algorithms do not need "full" normalization, which means that `duo` is
+ /// // larger than `div << shl` when the most significant bits are aligned.
+ /// assert!((div << shl) <= duo);
+ /// }
+ /// `
+ ///
+ /// Note: If the software bisection algorithm is being used in this function, it happens
+ /// that full normalization always occurs, so be careful that new algorithms are not
+ /// invisibly depending on this invariant when `full_normalization` is set to `false`.
+ $(
+ #[$unsigned_attr]
+ )*
+ fn $name(duo: $uX, div: $uX, full_normalization: bool) -> usize {
+ // We have to find the leading zeros of `div` to know where its msb (most significant
+ // set bit) is to even begin binary long division. It is also good to know where the msb
+ // of `duo` is so that useful work can be started instead of shifting `div` for all
+ // possible quotients (many division steps are wasted if `duo.leading_zeros()` is large
+ // and `div` starts out being shifted all the way to the msb). Aligning the msbs of
+ // `div` and `duo` could be done by shifting `div` left by
+ // `div.leading_zeros() - duo.leading_zeros()`, but some CPUs without division hardware
+ // also do not have single instructions for calculating `leading_zeros`. Instead of
+ // software doing two bisections to find the two `leading_zeros`, we do one bisection to
+ // find `div.leading_zeros() - duo.leading_zeros()` without actually knowing either of
+ // the leading zeros values.
+
+ let mut shl: usize;
+ if $use_lz {
+ shl = (div.leading_zeros() - duo.leading_zeros()) as usize;
+ if full_normalization {
+ if duo < (div << shl) {
+ // when the msb of `duo` and `div` are aligned, the resulting `div` may be
+ // larger than `duo`, so we decrease the shift by 1.
+ shl -= 1;
+ }
+ }
+ } else {
+ let mut test = duo;
+ shl = 0usize;
+ let mut lvl = $n >> 1;
+ loop {
+ let tmp = test >> lvl;
+ // It happens that a final `duo < (div << shl)` check is not needed, because the
+ // `div <= tmp` check insures that the msb of `test` never passes the msb of
+ // `div`, and any set bits shifted off the end of `test` would still keep
+ // `div <= tmp` true.
+ if div <= tmp {
+ test = tmp;
+ shl += lvl;
+ }
+ // narrow down bisection
+ lvl >>= 1;
+ if lvl == 0 {
+ break
+ }
+ }
+ }
+ // tests the invariants that should hold before beginning binary long division
+ /*
+ if full_normalization {
+ assert!((div << shl) <= duo);
+ }
+ if duo.leading_zeros() != (div << shl).leading_zeros() {
+ assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+ assert!(duo < (div << (shl + 1)));
+ }
+ */
+ shl
+ }
+ }
+}
diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs
new file mode 100644
index 000000000..7e104053b
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs
@@ -0,0 +1,386 @@
+/// Creates an unsigned division function optimized for division of integers with bitwidths
+/// larger than the largest hardware integer division supported. These functions use large radix
+/// division algorithms that require both fast division and very fast widening multiplication on the
+/// target microarchitecture. Otherwise, `impl_delegate` should be used instead.
+#[allow(unused_macros)]
+macro_rules! impl_trifecta {
+ (
+ $fn:ident, // name of the unsigned division function
+ $zero_div_fn:ident, // function called when division by zero is attempted
+ $half_division:ident, // function for division of a $uX by a $uX
+ $n_h:expr, // the number of bits in $iH or $uH
+ $uH:ident, // unsigned integer with half the bit width of $uX
+ $uX:ident, // unsigned integer with half the bit width of $uD
+ $uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ ) => {
+ /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+ /// tuple.
+ pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+ // This is called the trifecta algorithm because it uses three main algorithms: short
+ // division for small divisors, the two possibility algorithm for large divisors, and an
+ // undersubtracting long division algorithm for intermediate cases.
+
+ // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this
+ // to use a widening multiply to 128 bits on the relevant architectures.
+ fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+ let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+ (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+ }
+ fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+ let tmp = (lhs as $uD)
+ .wrapping_mul(mul as $uD)
+ .wrapping_add(add as $uD);
+ (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+ }
+
+ // the number of bits in a $uX
+ let n = $n_h * 2;
+
+ if div == 0 {
+ $zero_div_fn()
+ }
+
+ // Trying to use a normalization shift function will cause inelegancies in the code and
+ // inefficiencies for architectures with a native count leading zeros instruction. The
+ // undersubtracting algorithm needs both values (keeping the original `div_lz` but
+ // updating `duo_lz` multiple times), so we assume hardware support for fast
+ // `leading_zeros` calculation.
+ let div_lz = div.leading_zeros();
+ let mut duo_lz = duo.leading_zeros();
+
+ // the possible ranges of `duo` and `div` at this point:
+ // `0 <= duo < 2^n_d`
+ // `1 <= div < 2^n_d`
+
+ // quotient is 0 or 1 branch
+ if div_lz <= duo_lz {
+ // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at
+ // least one place higher than `div` for the quotient to be more than 1.
+ if duo >= div {
+ return (1, duo - div);
+ } else {
+ return (0, duo);
+ }
+ }
+
+ // `_sb` is the number of significant bits (from the ones place to the highest set bit)
+ // `{2, 2^div_sb} <= duo < 2^n_d`
+ // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+ // smaller division branch
+ if duo_lz >= n {
+ // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the
+ // `div_lz <= duo_lz` branch) so no numerical error.
+ let (quo, rem) = $half_division(duo as $uX, div as $uX);
+ return (quo as $uD, rem as $uD);
+ }
+
+ // `{2^n, 2^div_sb} <= duo < 2^n_d`
+ // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+ // short division branch
+ if div_lz >= (n + $n_h) {
+ // `1 <= div < {2^duo_sb, 2^n_h}`
+
+ // It is barely possible to improve the performance of this by calculating the
+ // reciprocal and removing one `$half_division`, but only if the CPU can do fast
+ // multiplications in parallel. Other reciprocal based methods can remove two
+ // `$half_division`s, but have multiplications that cannot be done in parallel and
+ // reduce performance. I have decided to use this trivial short division method and
+ // rely on the CPU having quick divisions.
+
+ let duo_hi = (duo >> n) as $uX;
+ let div_0 = div as $uH as $uX;
+ let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+ let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
+ let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+ let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
+ let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+ return (
+ (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
+ rem_1 as $uD,
+ );
+ }
+
+ // relative leading significant bits, cannot overflow because of above branches
+ let lz_diff = div_lz - duo_lz;
+
+ // `{2^n, 2^div_sb} <= duo < 2^n_d`
+ // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}`
+ // `mul` or `mul - 1` branch
+ if lz_diff < $n_h {
+ // Two possibility division algorithm
+
+ // The most significant bits of `duo` and `div` are within `$n_h` bits of each
+ // other. If we take the `n` most significant bits of `duo` and divide them by the
+ // corresponding bits in `div`, it produces a quotient value `quo`. It happens that
+ // `quo` or `quo - 1` will always be the correct quotient for the whole number. In
+ // other words, the bits less significant than the `n` most significant bits of
+ // `duo` and `div` can only influence the quotient to be one of two values.
+ // Because there are only two possibilities, there only needs to be one `$uH` sized
+ // division, a `$uH` by `$uD` multiplication, and only one branch with a few simple
+ // operations.
+ //
+ // Proof that the true quotient can only be `quo` or `quo - 1`.
+ // All `/` operators here are floored divisions.
+ //
+ // `shift` is the number of bits not in the higher `n` significant bits of `duo`.
+ // (definitions)
+ // 0. shift = n - duo_lz
+ // 1. duo_sig_n == duo / 2^shift
+ // 2. div_sig_n == div / 2^shift
+ // 3. quo == duo_sig_n / div_sig_n
+ //
+ //
+ // We are trying to find the true quotient, `true_quo`.
+ // 4. true_quo = duo / div. (definition)
+ //
+ // This is true because of the bits that are cut off during the bit shift.
+ // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift.
+ // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift.
+ //
+ // Dividing each bound of (5) by each bound of (6) gives 4 possibilities for what
+ // `true_quo == duo / div` is bounded by:
+ // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift)
+ // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift)
+ // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift)
+ // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift)
+ //
+ // Simplifying each of these four:
+ // duo_sig_n / div_sig_n
+ // duo_sig_n / (div_sig_n + 1)
+ // (duo_sig_n + 1) / div_sig_n
+ // (duo_sig_n + 1) / (div_sig_n + 1)
+ //
+ // Taking the smallest and the largest of these as the low and high bounds
+ // and replacing `duo / div` with `true_quo`:
+ // 7. duo_sig_n / (div_sig_n + 1) <= true_quo < (duo_sig_n + 1) / div_sig_n
+ //
+ // The `lz_diff < n_h` conditional on this branch makes sure that `div_sig_n` is at
+ // least `2^n_h`, and the `div_lz <= duo_lz` branch makes sure that the highest bit
+ // of `div_sig_n` is not the `2^(n - 1)` bit.
+ // 8. `2^(n - 1) <= duo_sig_n < 2^n`
+ // 9. `2^n_h <= div_sig_n < 2^(n - 1)`
+ //
+ // We want to prove that either
+ // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1)` or that
+ // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1) + 1`.
+ //
+ // We also want to prove that `quo` is one of these:
+ // `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+ // `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n`.
+ //
+ // When 1 is added to the numerator of `duo_sig_n / div_sig_n` to produce
+ // `(duo_sig_n + 1) / div_sig_n`, it is not possible that the value increases by
+ // more than 1 with floored integer arithmetic and `div_sig_n != 0`. Consider
+ // `x/y + 1 < (x + 1)/y` <=> `x/y + 1 < x/y + 1/y` <=> `1 < 1/y` <=> `y < 1`.
+ // `div_sig_n` is a nonzero integer. Thus,
+ // 10. `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n` or
+ // `(duo_sig_n / div_sig_n) + 1 == (duo_sig_n + 1) / div_sig_n.
+ //
+ // When 1 is added to the denominator of `duo_sig_n / div_sig_n` to produce
+ // `duo_sig_n / (div_sig_n + 1)`, it is not possible that the value decreases by
+ // more than 1 with the bounds (8) and (9). Consider `x/y - 1 <= x/(y + 1)` <=>
+ // `(x - y)/y < x/(y + 1)` <=> `(y + 1)*(x - y) < x*y` <=> `x*y - y*y + x - y < x*y`
+ // <=> `x < y*y + y`. The smallest value of `div_sig_n` is `2^n_h` and the largest
+ // value of `duo_sig_n` is `2^n - 1`. Substituting reveals `2^n - 1 < 2^n + 2^n_h`.
+ // Thus,
+ // 11. `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+ // `(duo_sig_n / div_sig_n) - 1` == duo_sig_n / (div_sig_n + 1)`
+ //
+ // Combining both (10) and (11), we know that
+ // `quo - 1 <= duo_sig_n / (div_sig_n + 1) <= true_quo
+ // < (duo_sig_n + 1) / div_sig_n <= quo + 1` and therefore:
+ // 12. quo - 1 <= true_quo < quo + 1
+ //
+ // In a lot of division algorithms using smaller divisions to construct a larger
+ // division, we often encounter a situation where the approximate `quo` value
+ // calculated from a smaller division is multiple increments away from the true
+ // `quo` value. In those algorithms, multiple correction steps have to be applied.
+ // Those correction steps may need more multiplications to test `duo - (quo*div)`
+ // again. Because of the fact that our `quo` can only be one of two values, we can
+ // see if `duo - (quo*div)` overflows. If it did overflow, then we know that we have
+ // the larger of the two values (since the true quotient is unique, and any larger
+ // quotient will cause `duo - (quo*div)` to be negative). Also because there is only
+ // one correction needed, we can calculate the remainder `duo - (true_quo*div) ==
+ // duo - ((quo - 1)*div) == duo - (quo*div - div) == duo + div - quo*div`.
+ // If `duo - (quo*div)` did not overflow, then we have the correct answer.
+ let shift = n - duo_lz;
+ let duo_sig_n = (duo >> shift) as $uX;
+ let div_sig_n = (div >> shift) as $uX;
+ let quo = $half_division(duo_sig_n, div_sig_n).0;
+
+ // The larger `quo` value can overflow `$uD` in the right circumstances. This is a
+ // manual `carrying_mul_add` with overflow checking.
+ let div_lo = div as $uX;
+ let div_hi = (div >> n) as $uX;
+ let (tmp_lo, carry) = carrying_mul(quo, div_lo);
+ let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
+ let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+ if (overflow != 0) || (duo < tmp) {
+ return (
+ (quo - 1) as $uD,
+ // Both the addition and subtraction can overflow, but when combined end up
+ // as a correct positive number.
+ duo.wrapping_add(div).wrapping_sub(tmp),
+ );
+ } else {
+ return (quo as $uD, duo - tmp);
+ }
+ }
+
+ // Undersubtracting long division algorithm.
+ // Instead of clearing a minimum of 1 bit from `duo` per iteration via binary long
+ // division, `n_h - 1` bits are cleared per iteration with this algorithm. It is a more
+ // complicated version of regular long division. Most integer division algorithms tend
+ // to guess a part of the quotient, and may have a larger quotient than the true
+ // quotient (which when multiplied by `div` will "oversubtract" the original dividend).
+ // They then check if the quotient was in fact too large and then have to correct it.
+ // This long division algorithm has been carefully constructed to always underguess the
+ // quotient by slim margins. This allows different subalgorithms to be blindly jumped to
+ // without needing an extra correction step.
+ //
+ // The only problem is that this subalgorithm will not work for many ranges of `duo` and
+ // `div`. Fortunately, the short division, two possibility algorithm, and other simple
+ // cases happen to exactly fill these gaps.
+ //
+ // For an example, consider the division of 76543210 by 213 and assume that `n_h` is
+ // equal to two decimal digits (note: we are working with base 10 here for readability).
+ // The first `sig_n_h` part of the divisor (21) is taken and is incremented by 1 to
+ // prevent oversubtraction. We also record the number of extra places not a part of
+ // the `sig_n` or `sig_n_h` parts.
+ //
+ // sig_n_h == 2 digits, sig_n == 4 digits
+ //
+ // vvvv <- `duo_sig_n`
+ // 76543210
+ // ^^^^ <- extra places in duo, `duo_extra == 4`
+ //
+ // vv <- `div_sig_n_h`
+ // 213
+ // ^ <- extra places in div, `div_extra == 1`
+ //
+ // The difference in extra places, `duo_extra - div_extra == extra_shl == 3`, is used
+ // for shifting partial sums in the long division.
+ //
+ // In the first step, the first `sig_n` part of duo (7654) is divided by
+ // `div_sig_n_h_add_1` (22), which results in a partial quotient of 347. This is
+ // multiplied by the whole divisor to make 73911, which is shifted left by `extra_shl`
+ // and subtracted from duo. The partial quotient is also shifted left by `extra_shl` to
+ // be added to `quo`.
+ //
+ // 347
+ // ________
+ // |76543210
+ // -73911
+ // 2632210
+ //
+ // Variables dependent on duo have to be updated:
+ //
+ // vvvv <- `duo_sig_n == 2632`
+ // 2632210
+ // ^^^ <- `duo_extra == 3`
+ //
+ // `extra_shl == 2`
+ //
+ // Two more steps are taken after this and then duo fits into `n` bits, and then a final
+ // normal long division step is made. The partial quotients are all progressively added
+ // to each other in the actual algorithm, but here I have left them all in a tower that
+ // can be added together to produce the quotient, 359357.
+ //
+ // 14
+ // 443
+ // 119
+ // 347
+ // ________
+ // |76543210
+ // -73911
+ // 2632210
+ // -25347
+ // 97510
+ // -94359
+ // 3151
+ // -2982
+ // 169 <- the remainder
+
+ let mut duo = duo;
+ let mut quo: $uD = 0;
+
+ // The number of lesser significant bits not a part of `div_sig_n_h`
+ let div_extra = (n + $n_h) - div_lz;
+
+ // The most significant `n_h` bits of div
+ let div_sig_n_h = (div >> div_extra) as $uH;
+
+ // This needs to be a `$uX` in case of overflow from the increment
+ let div_sig_n_h_add1 = (div_sig_n_h as $uX) + 1;
+
+ // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d`
+ // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}`
+ loop {
+ // The number of lesser significant bits not a part of `duo_sig_n`
+ let duo_extra = n - duo_lz;
+
+ // The most significant `n` bits of `duo`
+ let duo_sig_n = (duo >> duo_extra) as $uX;
+
+ // the two possibility algorithm requires that the difference between msbs is less
+ // than `n_h`, so the comparison is `<=` here.
+ if div_extra <= duo_extra {
+ // Undersubtracting long division step
+ let quo_part = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD;
+ let extra_shl = duo_extra - div_extra;
+
+ // Addition to the quotient.
+ quo += (quo_part << extra_shl);
+
+ // Subtraction from `duo`. At least `n_h - 1` bits are cleared from `duo` here.
+ duo -= (div.wrapping_mul(quo_part) << extra_shl);
+ } else {
+ // Two possibility algorithm
+ let shift = n - duo_lz;
+ let duo_sig_n = (duo >> shift) as $uX;
+ let div_sig_n = (div >> shift) as $uX;
+ let quo_part = $half_division(duo_sig_n, div_sig_n).0;
+ let div_lo = div as $uX;
+ let div_hi = (div >> n) as $uX;
+
+ let (tmp_lo, carry) = carrying_mul(quo_part, div_lo);
+ // The undersubtracting long division algorithm has already run once, so
+ // overflow beyond `$uD` bits is not possible here
+ let (tmp_hi, _) = carrying_mul_add(quo_part, div_hi, carry);
+ let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+
+ if duo < tmp {
+ return (
+ quo + ((quo_part - 1) as $uD),
+ duo.wrapping_add(div).wrapping_sub(tmp),
+ );
+ } else {
+ return (quo + (quo_part as $uD), duo - tmp);
+ }
+ }
+
+ duo_lz = duo.leading_zeros();
+
+ if div_lz <= duo_lz {
+ // quotient can have 0 or 1 added to it
+ if div <= duo {
+ return (quo + 1, duo - div);
+ } else {
+ return (quo, duo);
+ }
+ }
+
+ // This can only happen if `div_sd < n` (because of previous "quo = 0 or 1"
+ // branches), but it is not worth it to unroll further.
+ if n <= duo_lz {
+ // simple division and addition
+ let tmp = $half_division(duo as $uX, div as $uX);
+ return (quo + (tmp.0 as $uD), tmp.1 as $uD);
+ }
+ }
+ }
+ };
+}
diff --git a/vendor/compiler_builtins/src/int/udiv.rs b/vendor/compiler_builtins/src/int/udiv.rs
new file mode 100644
index 000000000..fb09f87d8
--- /dev/null
+++ b/vendor/compiler_builtins/src/int/udiv.rs
@@ -0,0 +1,106 @@
+#[cfg(not(feature = "public-test-deps"))]
+pub(crate) use int::specialized_div_rem::*;
+
+#[cfg(feature = "public-test-deps")]
+pub use int::specialized_div_rem::*;
+
+intrinsics! {
+ #[maybe_use_optimized_c_shim]
+ #[arm_aeabi_alias = __aeabi_uidiv]
+ /// Returns `n / d`
+ pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
+ u32_div_rem(n, d).0
+ }
+
+ #[maybe_use_optimized_c_shim]
+ /// Returns `n % d`
+ pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
+ u32_div_rem(n, d).1
+ }
+
+ #[avr_skip]
+ #[maybe_use_optimized_c_shim]
+ /// Returns `n / d` and sets `*rem = n % d`
+ pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
+ let quo_rem = u32_div_rem(n, d);
+ if let Some(rem) = rem {
+ *rem = quo_rem.1;
+ }
+ quo_rem.0
+ }
+
+ #[avr_skip]
+ #[maybe_use_optimized_c_shim]
+ /// Returns `n / d`
+ pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
+ u64_div_rem(n, d).0
+ }
+
+ #[avr_skip]
+ #[maybe_use_optimized_c_shim]
+ /// Returns `n % d`
+ pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 {
+ u64_div_rem(n, d).1
+ }
+
+ #[avr_skip]
+ #[maybe_use_optimized_c_shim]
+ /// Returns `n / d` and sets `*rem = n % d`
+ pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+ let quo_rem = u64_div_rem(n, d);
+ if let Some(rem) = rem {
+ *rem = quo_rem.1;
+ }
+ quo_rem.0
+ }
+
+ // Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
+ // the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.
+
+ #[avr_skip]
+ #[win64_128bit_abi_hack]
+ /// Returns `n / d`
+ pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
+ #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+ u128_div_rem(n, d).0
+ }
+ #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+ u128_divide_sparc(n, d, &mut 0)
+ }
+ }
+
+ #[avr_skip]
+ #[win64_128bit_abi_hack]
+ /// Returns `n % d`
+ pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
+ #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+ u128_div_rem(n, d).1
+ }
+ #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+ let mut rem = 0;
+ u128_divide_sparc(n, d, &mut rem);
+ rem
+ }
+ }
+
+ #[avr_skip]
+ #[win64_128bit_abi_hack]
+ /// Returns `n / d` and sets `*rem = n % d`
+ pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+ #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+ let quo_rem = u128_div_rem(n, d);
+ if let Some(rem) = rem {
+ *rem = quo_rem.1;
+ }
+ quo_rem.0
+ }
+ #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+ let mut tmp = 0;
+ let quo = u128_divide_sparc(n, d, &mut tmp);
+ if let Some(rem) = rem {
+ *rem = tmp;
+ }
+ quo
+ }
+ }
+}