//! A simple big-integer type for slow path algorithms.
//!
//! This includes a minimal stack vector for use in big-integer arithmetic.

#![doc(hidden)]

#[cfg(feature = "alloc")]
use crate::heapvec::HeapVec;
use crate::num::Float;
#[cfg(not(feature = "alloc"))]
use crate::stackvec::StackVec;
#[cfg(not(feature = "compact"))]
use crate::table::{LARGE_POW5, LARGE_POW5_STEP};
use core::{cmp, ops, ptr};

/// Number of bits in a Bigint.
///
/// This needs to be at least the number of bits required to store
/// a Bigint, which is `log2(radix**digits)`.
/// ≅ 3600 for base-10, rounded-up.
pub const BIGINT_BITS: usize = 4000;

/// The number of limbs for the bigint.
pub const BIGINT_LIMBS: usize = BIGINT_BITS / LIMB_BITS;

#[cfg(feature = "alloc")]
pub type VecType = HeapVec;

#[cfg(not(feature = "alloc"))]
pub type VecType = StackVec;

/// Storage for a big integer type.
///
/// This is used for algorithms when we have a finite number of digits.
/// Specifically, it stores all the significant digits scaled to the
/// proper exponent, as an integral type, and then directly compares
/// these digits.
///
/// This requires us to store the number of significant bits, plus the
/// number of exponent bits (required), since we scale everything
/// to the same exponent.
#[derive(Clone, PartialEq, Eq)]
pub struct Bigint {
    /// Significant digits for the float, stored in a big integer in LE order.
    ///
    /// This is pretty much the same number of digits for any radix, since the
    /// significant digits balance out the zeros from the exponent:
    ///     1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros.
    ///     2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent zeros.
    ///     3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent zeros.
    ///
    /// However, the number of bytes required is larger for large radixes:
    /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36
    /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data,
    /// we avoid a major performance hit from the large buffer size.
    pub data: VecType,
}

#[allow(clippy::new_without_default)]
impl Bigint {
    /// Construct a bigint representing 0.
    #[inline(always)]
    pub fn new() -> Self {
        Self {
            data: VecType::new(),
        }
    }

    /// Construct a bigint from an integer.
    #[inline(always)]
    pub fn from_u64(value: u64) -> Self {
        Self {
            data: VecType::from_u64(value),
        }
    }

    /// Get the high 64 bits from the vector, and if any lower bits are truncated.
    #[inline(always)]
    pub fn hi64(&self) -> (u64, bool) {
        self.data.hi64()
    }

    /// Multiply and assign as if by exponentiation by a power.
    #[inline]
    pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> {
        debug_assert!(base == 2 || base == 5 || base == 10);
        if base % 5 == 0 {
            pow(&mut self.data, exp)?;
        }
        if base % 2 == 0 {
            shl(&mut self.data, exp as usize)?;
        }
        Some(())
    }

    /// Calculate the bit-length of the big-integer.
    #[inline]
    pub fn bit_length(&self) -> u32 {
        bit_length(&self.data)
    }
}

impl ops::MulAssign<&Bigint> for Bigint {
    fn mul_assign(&mut self, rhs: &Bigint) {
        self.data *= &rhs.data;
    }
}

// REVERSE VIEW
// ------------

/// Reverse, immutable view of a sequence.
pub struct ReverseView<'a, T: 'a> {
    inner: &'a [T],
}

impl<'a, T> ops::Index<usize> for ReverseView<'a, T> {
    type Output = T;

    #[inline]
    fn index(&self, index: usize) -> &T {
        let len = self.inner.len();
        &(*self.inner)[len - index - 1]
    }
}

/// Create a reverse view of the vector for indexing.
#[inline]
pub fn rview(x: &[Limb]) -> ReverseView<Limb> {
    ReverseView {
        inner: x,
    }
}
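// Illustrative sketch (not part of the original sources): a small sanity
// check showing that `rview(x)[0]` is the most-significant (last) limb of a
// little-endian buffer. The module name is arbitrary.
#[cfg(test)]
mod rview_example_tests {
    use super::*;

    #[test]
    fn rview_indexes_from_the_back() {
        // Little-endian limbs: index 0 of the view is the highest limb.
        let x: [Limb; 3] = [2, 3, 4];
        let rx = rview(&x);
        assert_eq!(rx[0], 4);
        assert_eq!(rx[2], 2);
    }
}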
// COMPARE
// -------

/// Compare `x` to `y`, in little-endian order.
#[inline]
pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering {
    match x.len().cmp(&y.len()) {
        cmp::Ordering::Equal => {
            let iter = x.iter().rev().zip(y.iter().rev());
            for (&xi, yi) in iter {
                match xi.cmp(yi) {
                    cmp::Ordering::Equal => (),
                    ord => return ord,
                }
            }
            // Equal case.
            cmp::Ordering::Equal
        },
        ord => ord,
    }
}

// NORMALIZE
// ---------

/// Normalize the integer, so any leading zero values are removed.
#[inline]
pub fn normalize(x: &mut VecType) {
    // We don't care if this wraps: the index is bounds-checked.
    while let Some(&value) = x.get(x.len().wrapping_sub(1)) {
        if value == 0 {
            unsafe { x.set_len(x.len() - 1) };
        } else {
            break;
        }
    }
}

/// Get if the big integer is normalized.
#[inline]
#[allow(clippy::match_like_matches_macro)]
pub fn is_normalized(x: &[Limb]) -> bool {
    // We don't care if this wraps: the index is bounds-checked.
    match x.get(x.len().wrapping_sub(1)) {
        Some(&0) => false,
        _ => true,
    }
}

// FROM
// ----

/// Create StackVec from u64 value.
#[inline(always)]
#[allow(clippy::branches_sharing_code)]
pub fn from_u64(x: u64) -> VecType {
    let mut vec = VecType::new();
    debug_assert!(vec.capacity() >= 2);
    if LIMB_BITS == 32 {
        vec.try_push(x as Limb).unwrap();
        vec.try_push((x >> 32) as Limb).unwrap();
    } else {
        vec.try_push(x as Limb).unwrap();
    }
    vec.normalize();
    vec
}

// HI
// --

/// Check if any of the remaining bits are non-zero.
///
/// # Safety
///
/// Safe as long as `rindex <= x.len()`.
#[inline]
pub fn nonzero(x: &[Limb], rindex: usize) -> bool {
    debug_assert!(rindex <= x.len());

    let len = x.len();
    let slc = &x[..len - rindex];
    slc.iter().rev().any(|&x| x != 0)
}

// These return the high X bits and if the bits were truncated.

/// Shift 32-bit integer to high 64-bits.
#[inline]
pub fn u32_to_hi64_1(r0: u32) -> (u64, bool) {
    u64_to_hi64_1(r0 as u64)
}

/// Shift 2 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) {
    let r0 = (r0 as u64) << 32;
    let r1 = r1 as u64;
    u64_to_hi64_1(r0 | r1)
}

/// Shift 3 32-bit integers to high 64-bits.
#[inline]
pub fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) {
    let r0 = r0 as u64;
    let r1 = (r1 as u64) << 32;
    let r2 = r2 as u64;
    u64_to_hi64_2(r0, r1 | r2)
}

/// Shift 64-bit integer to high 64-bits.
#[inline]
pub fn u64_to_hi64_1(r0: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    (r0 << ls, false)
}

/// Shift 2 64-bit integers to high 64-bits.
#[inline]
pub fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    let rs = 64 - ls;
    let v = match ls {
        0 => r0,
        _ => (r0 << ls) | (r1 >> rs),
    };
    let n = r1 << ls != 0;
    (v, n)
}
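// Illustrative sketch (not part of the original sources): small checks of
// `compare` on little-endian limb buffers and of the truncation flag
// returned by `u64_to_hi64_2`. The module name is arbitrary.
#[cfg(test)]
mod hi64_helper_example_tests {
    use super::*;
    use core::cmp::Ordering;

    #[test]
    fn compare_is_little_endian() {
        // A longer (normalized) buffer always compares greater.
        assert_eq!(compare(&[0, 1], &[5]), Ordering::Greater);
        // Equal lengths compare from the most-significant limb down.
        assert_eq!(compare(&[1, 2], &[3, 2]), Ordering::Less);
        assert_eq!(compare(&[3, 2], &[3, 2]), Ordering::Equal);
    }

    #[test]
    fn u64_to_hi64_2_sets_truncated_bit() {
        // The high word has 63 leading zeros, so 63 bits of `r1` are shifted
        // in, and the remaining low bit of `r1` marks the result as truncated.
        let (hi, truncated) = u64_to_hi64_2(1, u64::MAX);
        assert_eq!(hi, u64::MAX);
        assert!(truncated);
    }
}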
/// Extract the hi bits from the buffer.
macro_rules! hi {
    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 1`.
    (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        $fn($rview[0] as $t)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        $fn(r0, r1)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 2`.
    (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@2 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 2))
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = $rview[0] as $t;
        let r1 = $rview[1] as $t;
        let r2 = $rview[2] as $t;
        $fn(r0, r1, r2)
    }};

    // # Safety
    //
    // Safe as long as the `stackvec.len() >= 3`.
    (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@3 $self, $rview, $t, $fn);
        (v, n || nonzero($self, 3))
    }};
}

/// Get the high 64 bits from the vector.
#[inline(always)]
pub fn hi64(x: &[Limb]) -> (u64, bool) {
    let rslc = rview(x);
    // SAFETY: the buffer must be at least length bytes long.
    match x.len() {
        0 => (0, false),
        1 if LIMB_BITS == 32 => hi!(@1 x, rslc, u32, u32_to_hi64_1),
        1 => hi!(@1 x, rslc, u64, u64_to_hi64_1),
        2 if LIMB_BITS == 32 => hi!(@2 x, rslc, u32, u32_to_hi64_2),
        2 => hi!(@2 x, rslc, u64, u64_to_hi64_2),
        _ if LIMB_BITS == 32 => hi!(@nonzero3 x, rslc, u32, u32_to_hi64_3),
        _ => hi!(@nonzero2 x, rslc, u64, u64_to_hi64_2),
    }
}
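// Illustrative sketch (not part of the original sources): `hi64` returns the
// most-significant 64 bits of the buffer, left-aligned, and reports whether
// any lower bits were lost, independent of the limb width. The module name
// is arbitrary.
#[cfg(test)]
mod hi64_example_tests {
    use super::*;

    #[test]
    fn hi64_reports_truncation() {
        // A value that fits in 64 bits is returned unchanged (here already
        // fully left-aligned), with no truncation.
        let x = from_u64(u64::MAX);
        assert_eq!(hi64(&x), (u64::MAX, false));

        // A 65-bit, all-ones value keeps only the top 64 bits and flags the
        // lost low bit as truncation.
        let mut y = from_u64(u64::MAX);
        shl(&mut y, 1).unwrap();
        small_add(&mut y, 1).unwrap();
        assert_eq!(hi64(&y), (u64::MAX, true));
    }
}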
// POWERS
// ------

/// MulAssign by a power of 5.
///
/// Theoretically...
///
/// Use an exponentiation by squaring method, since it reduces the time
/// complexity of the multiplication to ~`O(log(n))` for the squaring,
/// and `O(n*m)` for the result. Since `m` is typically a lower-order
/// factor, this significantly reduces the number of multiplications
/// we need to do. Iteratively multiplying by small powers follows
/// the nth triangular number series, which scales as `O(p^2)`, but
/// where `p` is `n+m`. In short, it scales very poorly.
///
/// Practically....
///
/// Exponentiation by Squaring:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 1,018 ns/iter (+/- 78)
///     test bigcomp_f64_lexical ... bench: 3,639 ns/iter (+/- 1,007)
///
/// Exponentiation by Iterative Small Powers:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 518 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench: 583 ns/iter (+/- 47)
///
/// Exponentiation by Iterative Large Powers (of 2):
///     running 2 tests
///     test bigcomp_f32_lexical ... bench: 671 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench: 1,394 ns/iter (+/- 47)
///
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`.
///
/// However, using large powers is crucial for good performance for higher
/// powers.
///     pow/default     time: [426.20 ns 427.96 ns 429.89 ns]
///     pow/small       time: [2.9270 us 2.9411 us 2.9565 us]
///     pow/large:3     time: [838.51 ns 842.21 ns 846.27 ns]
///
/// Even using worst-case scenarios, exponentiation by squaring is
/// significantly slower for our workloads. Just multiply by small powers,
/// in simple cases, and use precalculated large powers in other cases.
///
/// Furthermore, using sufficiently big large powers is also crucial for
/// performance. This is a tradeoff of binary size and performance, and
/// using a single value at ~`5^(5 * max_exp)` seems optimal.
pub fn pow(x: &mut VecType, mut exp: u32) -> Option<()> {
    // Minimize the number of iterations for large exponents: just
    // do a few steps with large powers.
    #[cfg(not(feature = "compact"))]
    {
        while exp >= LARGE_POW5_STEP {
            large_mul(x, &LARGE_POW5)?;
            exp -= LARGE_POW5_STEP;
        }
    }

    // Now use our pre-computed small powers iteratively.
    // This is calculated as `⌊log(2^BITS - 1, 5)⌋`.
    let small_step = if LIMB_BITS == 32 {
        13
    } else {
        27
    };
    let max_native = (5 as Limb).pow(small_step);
    while exp >= small_step {
        small_mul(x, max_native)?;
        exp -= small_step;
    }
    if exp != 0 {
        // SAFETY: safe, since `exp < small_step`.
        let small_power = unsafe { f64::int_pow_fast_path(exp as usize, 5) };
        small_mul(x, small_power as Limb)?;
    }
    Some(())
}

// SCALAR
// ------

/// Add two small integers and return the resulting value and if overflow happens.
#[inline(always)]
pub fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) {
    x.overflowing_add(y)
}

/// Multiply two small integers (with carry) (and return the overflow contribution).
///
/// Returns the (low, high) components.
#[inline(always)]
pub fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) {
    // Cannot overflow, as long as wide is 2x as wide. This is because
    // the following is always true:
    // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX`
    let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide);
    (z as Limb, (z >> LIMB_BITS) as Limb)
}

// SMALL
// -----

/// Add small integer to bigint starting from offset.
#[inline]
pub fn small_add_from(x: &mut VecType, y: Limb, start: usize) -> Option<()> {
    let mut index = start;
    let mut carry = y;
    while carry != 0 && index < x.len() {
        let result = scalar_add(x[index], carry);
        x[index] = result.0;
        carry = result.1 as Limb;
        index += 1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

/// Add small integer to bigint.
#[inline(always)]
pub fn small_add(x: &mut VecType, y: Limb) -> Option<()> {
    small_add_from(x, y, 0)
}

/// Multiply bigint by small integer.
#[inline]
pub fn small_mul(x: &mut VecType, y: Limb) -> Option<()> {
    let mut carry = 0;
    for xi in x.iter_mut() {
        let result = scalar_mul(*xi, y, carry);
        *xi = result.0;
        carry = result.1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

// LARGE
// -----

/// Add bigint to bigint starting from offset.
pub fn large_add_from(x: &mut VecType, y: &[Limb], start: usize) -> Option<()> {
    // The effective x buffer is from `xstart..x.len()`, so we need to treat
    // that as the current range. If the effective y buffer is longer, need
    // to resize to that, + the start index.
    if y.len() > x.len().saturating_sub(start) {
        // Ensure we panic if we can't extend the buffer.
        // This avoids any unsafe behavior afterwards.
        x.try_resize(y.len() + start, 0)?;
    }

    // Iteratively add elements from y to x.
    let mut carry = false;
    for (index, &yi) in y.iter().enumerate() {
        // We panicked in `try_resize` if this wasn't true.
        let xi = x.get_mut(start + index).unwrap();

        // Only one of the two ops can overflow, since we added at max
        // `Limb::max_value() + Limb::max_value()`. Add the previous carry,
        // and store the current carry for the next.
        let result = scalar_add(*xi, yi);
        *xi = result.0;
        let mut tmp = result.1;
        if carry {
            let result = scalar_add(*xi, 1);
            *xi = result.0;
            tmp |= result.1;
        }
        carry = tmp;
    }

    // Handle overflow.
    if carry {
        small_add_from(x, 1, y.len() + start)?;
    }
    Some(())
}

/// Add bigint to bigint.
#[inline(always)]
pub fn large_add(x: &mut VecType, y: &[Limb]) -> Option<()> {
    large_add_from(x, y, 0)
}
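// Illustrative sketch (not part of the original sources): the carrying
// scalar helpers and the small/large routines above, checked against plain
// `u64` arithmetic. The module name and values are arbitrary.
#[cfg(test)]
mod arithmetic_example_tests {
    use super::*;

    #[test]
    fn scalar_mul_splits_the_wide_product() {
        // `MAX * MAX + MAX == MAX * 2^LIMB_BITS`, so even the largest product
        // plus carry fits in the (low, high) limb pair without overflow.
        assert_eq!(scalar_mul(Limb::MAX, Limb::MAX, Limb::MAX), (0, Limb::MAX));
    }

    #[test]
    fn small_and_large_ops_match_u64() {
        // (2^32 - 1)^2 = 0xFFFF_FFFE_0000_0001 still fits in a u64.
        let mut x = from_u64(0xFFFF_FFFF);
        small_mul(&mut x, 0xFFFF_FFFF as Limb).unwrap();
        assert_eq!(&x[..], &from_u64(0xFFFF_FFFE_0000_0001)[..]);

        // (2^64 - 1) + 1 = 2^64 carries into a new limb.
        let mut y = from_u64(u64::MAX);
        large_add(&mut y, &from_u64(1)).unwrap();
        assert_eq!(hi64(&y), (1u64 << 63, false));
        assert_eq!(bit_length(&y), 65);
    }

    #[test]
    fn pow_scales_by_a_power_of_ten() {
        // `pow(10, 2)` multiplies by `5^2` and then shifts left by 2 bits.
        let mut a = Bigint::from_u64(7);
        a.pow(10, 2).unwrap();
        assert!(a == Bigint::from_u64(700));
    }
}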
/// Grade-school multiplication algorithm.
///
/// Slow, naive algorithm, using limb-bit bases and just shifting left for
/// each iteration. This could be optimized with numerous other algorithms,
/// but it's extremely simple, and works in O(n*m) time, which is fine
/// by me. Each iteration, of which there are `m` iterations, requires
/// `n` multiplications, and `n` additions, or grade-school multiplication.
///
/// Don't use Karatsuba multiplication, since our implementation seems to
/// be slower asymptotically, which is likely just due to the small sizes
/// we deal with here. For example, running on the following data:
///
/// ```text
/// const SMALL_X: &[u32] = &[
///     766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286,
///     1022770393, 468044626, 446028186
/// ];
/// const SMALL_Y: &[u32] = &[
///     3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095,
///     1678845844, 2373924447, 3640713171
/// ];
/// const LARGE_X: &[u32] = &[
///     3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854,
///     3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719,
///     638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684,
///     1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206,
///     692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290,
///     2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426,
///     197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094,
///     326310242
/// ];
/// const LARGE_Y: &[u32] = &[
///     1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505,
///     2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760,
///     3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106,
///     705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800,
///     1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691,
///     621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297,
///     1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171,
///     2247372529
/// ];
/// ```
///
/// We get the following results:
/// ```text
/// mul/small:long          time: [220.23 ns 221.47 ns 222.81 ns]
/// Found 4 outliers among 100 measurements (4.00%)
///     2 (2.00%) high mild
///     2 (2.00%) high severe
/// mul/small:karatsuba     time: [233.88 ns 234.63 ns 235.44 ns]
/// Found 11 outliers among 100 measurements (11.00%)
///     8 (8.00%) high mild
///     3 (3.00%) high severe
/// mul/large:long          time: [1.9365 us 1.9455 us 1.9558 us]
/// Found 12 outliers among 100 measurements (12.00%)
///     7 (7.00%) high mild
///     5 (5.00%) high severe
/// mul/large:karatsuba     time: [4.4250 us 4.4515 us 4.4812 us]
/// ```
///
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
pub fn long_mul(x: &[Limb], y: &[Limb]) -> Option<VecType> {
    // Using the immutable value, multiply by all the scalars in y, using
    // the algorithm defined above. Use a single buffer to avoid
    // frequent reallocations. Handle the first case to avoid a redundant
    // addition, since we know y.len() >= 1.
    let mut z = VecType::try_from(x)?;
    if !y.is_empty() {
        let y0 = y[0];
        small_mul(&mut z, y0)?;

        for (index, &yi) in y.iter().enumerate().skip(1) {
            if yi != 0 {
                let mut zi = VecType::try_from(x)?;
                small_mul(&mut zi, yi)?;
                large_add_from(&mut z, &zi, index)?;
            }
        }
    }

    z.normalize();
    Some(z)
}
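// Illustrative sketch (not part of the original sources): grade-school
// multiplication checked against native `u64` arithmetic for a product that
// still fits in 64 bits. The module name and values are arbitrary.
#[cfg(test)]
mod long_mul_example_tests {
    use super::*;

    #[test]
    fn long_mul_matches_u64() {
        // 0xFFFF_FFFF * 0x1_0001 = 0x1_0000_FFFE_FFFF fits in a u64.
        let x = from_u64(0xFFFF_FFFF);
        let y = from_u64(0x1_0001);
        let z = long_mul(&x, &y).unwrap();
        assert_eq!(&z[..], &from_u64(0xFFFF_FFFF * 0x1_0001)[..]);

        // The operator form goes through `MulAssign<&Bigint>` above.
        let mut a = Bigint::from_u64(0xFFFF_FFFF);
        a *= &Bigint::from_u64(0x1_0001);
        assert!(a == Bigint::from_u64(0xFFFF_FFFF * 0x1_0001));
    }
}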
/// Multiply bigint by bigint using grade-school multiplication algorithm.
#[inline(always)]
pub fn large_mul(x: &mut VecType, y: &[Limb]) -> Option<()> {
    // Karatsuba multiplication never makes sense, so just use grade school
    // multiplication.
    if y.len() == 1 {
        // SAFETY: safe since `y.len() == 1`.
        small_mul(x, y[0])?;
    } else {
        *x = long_mul(y, x)?;
    }
    Some(())
}

// SHIFT
// -----

/// Shift-left `n` bits inside a buffer.
#[inline]
pub fn shl_bits(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);

    // Internally, for each item, we shift left by n, and add the previous
    // right shifted limb-bits.
    // For example, we transform (for u8) shifted left 2, to:
    //      b10100100 b01000010
    //      b10 b10010001 b00001000
    debug_assert!(n < LIMB_BITS);
    let rshift = LIMB_BITS - n;
    let lshift = n;
    let mut prev: Limb = 0;
    for xi in x.iter_mut() {
        let tmp = *xi;
        *xi <<= lshift;
        *xi |= prev >> rshift;
        prev = tmp;
    }

    // Always push the carry, even if it creates a non-normal result.
    let carry = prev >> rshift;
    if carry != 0 {
        x.try_push(carry)?;
    }

    Some(())
}

/// Shift-left `n` limbs inside a buffer.
#[inline]
pub fn shl_limbs(x: &mut VecType, n: usize) -> Option<()> {
    debug_assert!(n != 0);
    if n + x.len() > x.capacity() {
        None
    } else if !x.is_empty() {
        let len = n + x.len();
        // SAFETY: since x is not empty, and `x.len() + n <= x.capacity()`.
        unsafe {
            // Move the elements.
            let src = x.as_ptr();
            let dst = x.as_mut_ptr().add(n);
            ptr::copy(src, dst, x.len());
            // Write our 0s.
            ptr::write_bytes(x.as_mut_ptr(), 0, n);
            x.set_len(len);
        }
        Some(())
    } else {
        Some(())
    }
}

/// Shift-left buffer by n bits.
#[inline]
pub fn shl(x: &mut VecType, n: usize) -> Option<()> {
    let rem = n % LIMB_BITS;
    let div = n / LIMB_BITS;
    if rem != 0 {
        shl_bits(x, rem)?;
    }
    if div != 0 {
        shl_limbs(x, div)?;
    }
    Some(())
}

/// Get number of leading zero bits in the storage.
#[inline]
pub fn leading_zeros(x: &[Limb]) -> u32 {
    let length = x.len();
    // wrapping_sub is fine, since it'll just return None.
    if let Some(&value) = x.get(length.wrapping_sub(1)) {
        value.leading_zeros()
    } else {
        0
    }
}

/// Calculate the bit-length of the big-integer.
#[inline]
pub fn bit_length(x: &[Limb]) -> u32 {
    let nlz = leading_zeros(x);
    LIMB_BITS as u32 * x.len() as u32 - nlz
}
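// Illustrative sketch (not part of the original sources): shifting left by a
// whole number of limbs plus a few bits, checked via `bit_length` and
// `hi64`. The module name is arbitrary.
#[cfg(test)]
mod shl_example_tests {
    use super::*;

    #[test]
    fn shl_adjusts_bit_length() {
        let mut x = from_u64(1);
        assert_eq!(bit_length(&x), 1);

        // Shift across a limb boundary: both the bit and limb shifts run.
        shl(&mut x, LIMB_BITS + 3).unwrap();
        assert_eq!(bit_length(&x), 1 + (LIMB_BITS + 3) as u32);
        assert_eq!(hi64(&x), (1u64 << 63, false));
    }
}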
// LIMB
// ----

// Type for a single limb of the big integer.
//
// A limb is analogous to a digit in base10, except, it stores 32-bit
// or 64-bit numbers instead. We want types where 64-bit multiplication
// is well-supported by the architecture, rather than emulated in 3
// instructions. The quickest way to check this support is using a
// cross-compiler for numerous architectures, along with the following
// source file and command:
//
// Compile with `gcc main.c -c -S -O3 -masm=intel`
//
// And the source code is:
// ```text
// #include <stdint.h>
//
// struct i128 {
//     uint64_t hi;
//     uint64_t lo;
// };
//
// // Type your code here, or load an example.
// struct i128 square(uint64_t x, uint64_t y) {
//     __int128 prod = (__int128)x * (__int128)y;
//     struct i128 z;
//     z.hi = (uint64_t)(prod >> 64);
//     z.lo = (uint64_t)prod;
//     return z;
// }
// ```
//
// If the result contains `call __multi3`, then the multiplication
// is emulated by the compiler. Otherwise, it's natively supported.
//
// This should cover all known 64-bit platforms supported by Rust.
// https://forge.rust-lang.org/platform-support.html
//
// # Supported
//
// Platforms where native 128-bit multiplication is explicitly supported:
// - x86_64 (Supported via `MUL`).
// - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from).
// - s390x (Supported via `MLGR`).
//
// # Efficient
//
// Platforms where native 64-bit multiplication is supported and
// you can extract hi-lo for 64-bit multiplications.
// - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits).
// - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low bits).
// - riscv64 (Requires `MUL` and `MULH` to capture high and low bits).
//
// # Unsupported
//
// Platforms where native 128-bit multiplication is not supported,
// requiring software emulation.
// - sparc64 (`UMUL` only supports double-word arguments).
// - sparcv9 (Same as sparc64).
//
// These tests are run via `xcross`, my own library for C cross-compiling,
// which supports numerous targets (far in excess of Rust's tier 1 support,
// or rust-embedded/cross's list). xcross may be found here:
// https://github.com/Alexhuszagh/xcross
//
// To compile for the given target, run:
// `xcross gcc main.c -c -S -O3 --target $target`
//
// All 32-bit architectures inherently do not have support. That means
// we can essentially look for 64-bit architectures that are not SPARC.

#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Limb = u64;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Wide = u128;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub const LIMB_BITS: usize = 64;

#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Limb = u32;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Wide = u64;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub const LIMB_BITS: usize = 32;
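// Illustrative sketch (not part of the original sources): whatever the
// platform selects, the limb/wide pair must satisfy the width relationship
// that `scalar_mul` relies on, namely that a full limb product plus a carry
// always fits in `Wide`. The module name is arbitrary.
#[cfg(test)]
mod limb_example_tests {
    use super::*;
    use core::mem::size_of;

    #[test]
    fn wide_is_twice_the_limb_width() {
        assert_eq!(size_of::<Wide>(), 2 * size_of::<Limb>());
        assert_eq!(LIMB_BITS, 8 * size_of::<Limb>());

        // `Wide::MAX - Limb::MAX * Limb::MAX >= Limb::MAX`, so `x * y + carry`
        // cannot overflow the wide type.
        let max = Limb::MAX as Wide;
        assert!(Wide::MAX - max * max >= max);
    }
}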