From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/compiler_builtins/src/arm.rs | 183 +++++++ vendor/compiler_builtins/src/arm_linux.rs | 214 ++++++++ vendor/compiler_builtins/src/float/add.rs | 213 ++++++++ vendor/compiler_builtins/src/float/cmp.rs | 253 ++++++++++ vendor/compiler_builtins/src/float/conv.rs | 351 +++++++++++++ vendor/compiler_builtins/src/float/div.rs | 467 ++++++++++++++++++ vendor/compiler_builtins/src/float/extend.rs | 83 ++++ vendor/compiler_builtins/src/float/mod.rs | 175 +++++++ vendor/compiler_builtins/src/float/mul.rs | 209 ++++++++ vendor/compiler_builtins/src/float/pow.rs | 36 ++ vendor/compiler_builtins/src/float/sub.rs | 25 + vendor/compiler_builtins/src/float/trunc.rs | 125 +++++ vendor/compiler_builtins/src/int/addsub.rs | 96 ++++ vendor/compiler_builtins/src/int/leading_zeros.rs | 149 ++++++ vendor/compiler_builtins/src/int/mod.rs | 390 +++++++++++++++ vendor/compiler_builtins/src/int/mul.rs | 138 ++++++ vendor/compiler_builtins/src/int/sdiv.rs | 169 +++++++ vendor/compiler_builtins/src/int/shift.rs | 116 +++++ .../src/int/specialized_div_rem/asymmetric.rs | 69 +++ .../src/int/specialized_div_rem/binary_long.rs | 548 +++++++++++++++++++++ .../src/int/specialized_div_rem/delegate.rs | 319 ++++++++++++ .../src/int/specialized_div_rem/mod.rs | 306 ++++++++++++ .../src/int/specialized_div_rem/norm_shift.rs | 106 ++++ .../src/int/specialized_div_rem/trifecta.rs | 386 +++++++++++++++ vendor/compiler_builtins/src/int/udiv.rs | 106 ++++ vendor/compiler_builtins/src/lib.rs | 72 +++ vendor/compiler_builtins/src/macros.rs | 448 +++++++++++++++++ vendor/compiler_builtins/src/math.rs | 117 +++++ vendor/compiler_builtins/src/mem/impls.rs | 267 ++++++++++ vendor/compiler_builtins/src/mem/mod.rs | 211 ++++++++ vendor/compiler_builtins/src/mem/x86_64.rs | 100 ++++ vendor/compiler_builtins/src/probestack.rs | 350 +++++++++++++ vendor/compiler_builtins/src/riscv.rs | 34 ++ vendor/compiler_builtins/src/x86.rs | 85 ++++ vendor/compiler_builtins/src/x86_64.rs | 94 ++++ 35 files changed, 7010 insertions(+) create mode 100644 vendor/compiler_builtins/src/arm.rs create mode 100644 vendor/compiler_builtins/src/arm_linux.rs create mode 100644 vendor/compiler_builtins/src/float/add.rs create mode 100644 vendor/compiler_builtins/src/float/cmp.rs create mode 100644 vendor/compiler_builtins/src/float/conv.rs create mode 100644 vendor/compiler_builtins/src/float/div.rs create mode 100644 vendor/compiler_builtins/src/float/extend.rs create mode 100644 vendor/compiler_builtins/src/float/mod.rs create mode 100644 vendor/compiler_builtins/src/float/mul.rs create mode 100644 vendor/compiler_builtins/src/float/pow.rs create mode 100644 vendor/compiler_builtins/src/float/sub.rs create mode 100644 vendor/compiler_builtins/src/float/trunc.rs create mode 100644 vendor/compiler_builtins/src/int/addsub.rs create mode 100644 vendor/compiler_builtins/src/int/leading_zeros.rs create mode 100644 vendor/compiler_builtins/src/int/mod.rs create mode 100644 vendor/compiler_builtins/src/int/mul.rs create mode 100644 vendor/compiler_builtins/src/int/sdiv.rs create mode 100644 vendor/compiler_builtins/src/int/shift.rs create mode 100644 vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs create mode 100644 vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs create mode 100644 
vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs create mode 100644 vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs create mode 100644 vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs create mode 100644 vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs create mode 100644 vendor/compiler_builtins/src/int/udiv.rs create mode 100644 vendor/compiler_builtins/src/lib.rs create mode 100644 vendor/compiler_builtins/src/macros.rs create mode 100644 vendor/compiler_builtins/src/math.rs create mode 100644 vendor/compiler_builtins/src/mem/impls.rs create mode 100644 vendor/compiler_builtins/src/mem/mod.rs create mode 100644 vendor/compiler_builtins/src/mem/x86_64.rs create mode 100644 vendor/compiler_builtins/src/probestack.rs create mode 100644 vendor/compiler_builtins/src/riscv.rs create mode 100644 vendor/compiler_builtins/src/x86.rs create mode 100644 vendor/compiler_builtins/src/x86_64.rs (limited to 'vendor/compiler_builtins/src') diff --git a/vendor/compiler_builtins/src/arm.rs b/vendor/compiler_builtins/src/arm.rs new file mode 100644 index 000000000..9c1b6ad12 --- /dev/null +++ b/vendor/compiler_builtins/src/arm.rs @@ -0,0 +1,183 @@ +#![cfg(not(feature = "no-asm"))] +#![allow(unused_imports)] + +use core::intrinsics; + +// iOS symbols have a leading underscore. +#[cfg(target_os = "ios")] +macro_rules! bl { + ($func:literal) => { + concat!("bl _", $func) + }; +} +#[cfg(not(target_os = "ios"))] +macro_rules! bl { + ($func:literal) => { + concat!("bl ", $func) + }; +} + +intrinsics! { + // NOTE This function and the ones below are implemented using assembly because they are using a + // custom calling convention which can't be implemented using a normal Rust function. + #[naked] + #[cfg(not(target_env = "msvc"))] + pub unsafe extern "C" fn __aeabi_uidivmod() { + core::arch::asm!( + "push {{lr}}", + "sub sp, sp, #4", + "mov r2, sp", + bl!("__udivmodsi4"), + "ldr r1, [sp]", + "add sp, sp, #4", + "pop {{pc}}", + options(noreturn) + ); + } + + #[naked] + pub unsafe extern "C" fn __aeabi_uldivmod() { + core::arch::asm!( + "push {{r4, lr}}", + "sub sp, sp, #16", + "add r4, sp, #8", + "str r4, [sp]", + bl!("__udivmoddi4"), + "ldr r2, [sp, #8]", + "ldr r3, [sp, #12]", + "add sp, sp, #16", + "pop {{r4, pc}}", + options(noreturn) + ); + } + + #[naked] + pub unsafe extern "C" fn __aeabi_idivmod() { + core::arch::asm!( + "push {{r0, r1, r4, lr}}", + bl!("__aeabi_idiv"), + "pop {{r1, r2}}", + "muls r2, r2, r0", + "subs r1, r1, r2", + "pop {{r4, pc}}", + options(noreturn) + ); + } + + #[naked] + pub unsafe extern "C" fn __aeabi_ldivmod() { + core::arch::asm!( + "push {{r4, lr}}", + "sub sp, sp, #16", + "add r4, sp, #8", + "str r4, [sp]", + bl!("__divmoddi4"), + "ldr r2, [sp, #8]", + "ldr r3, [sp, #12]", + "add sp, sp, #16", + "pop {{r4, pc}}", + options(noreturn) + ); + } + + // The following functions use weak linkage to allow users to override + // with custom implementation. + // FIXME: The `*4` and `*8` variants should be defined as aliases. + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) { + ::mem::memcpy(dest, src, n); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) { + // We are guaranteed 4-alignment, so accessing at u32 is okay. 
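// A minimal sketch of the contract that the naked __aeabi_uidivmod shim earlier in this
// file adapts to the AEABI convention (quotient returned in r0, remainder in r1). The
// helper name and signature mirror __udivmodsi4 as called by that shim; treat the exact
// signature as an assumption here, since the real helper lives in int/udiv.rs, not in
// this sketch.
fn udivmodsi4_sketch(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
    let q = n / d; // panics on d == 0; the real builtin has its own divide-by-zero behavior
    if let Some(r) = rem {
        *r = n - q * d;
    }
    q
}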
+ let mut dest = dest as *mut u32; + let mut src = src as *mut u32; + let mut n = n; + + while n >= 4 { + *dest = *src; + dest = dest.offset(1); + src = src.offset(1); + n -= 4; + } + + __aeabi_memcpy(dest as *mut u8, src as *const u8, n); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memcpy8(dest: *mut u8, src: *const u8, n: usize) { + __aeabi_memcpy4(dest, src, n); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memmove(dest: *mut u8, src: *const u8, n: usize) { + ::mem::memmove(dest, src, n); + } + + #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memmove4(dest: *mut u8, src: *const u8, n: usize) { + __aeabi_memmove(dest, src, n); + } + + #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memmove8(dest: *mut u8, src: *const u8, n: usize) { + __aeabi_memmove(dest, src, n); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memset(dest: *mut u8, n: usize, c: i32) { + // Note the different argument order + ::mem::memset(dest, c, n); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memset4(dest: *mut u8, n: usize, c: i32) { + let mut dest = dest as *mut u32; + let mut n = n; + + let byte = (c as u32) & 0xff; + let c = (byte << 24) | (byte << 16) | (byte << 8) | byte; + + while n >= 4 { + *dest = c; + dest = dest.offset(1); + n -= 4; + } + + __aeabi_memset(dest as *mut u8, n, byte as i32); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memset8(dest: *mut u8, n: usize, c: i32) { + __aeabi_memset4(dest, n, c); + } + + #[cfg(not(target_os = "ios"))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memclr(dest: *mut u8, n: usize) { + __aeabi_memset(dest, n, 0); + } + + #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memclr4(dest: *mut u8, n: usize) { + __aeabi_memset4(dest, n, 0); + } + + #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[linkage = "weak"] + pub unsafe extern "aapcs" fn __aeabi_memclr8(dest: *mut u8, n: usize) { + __aeabi_memset4(dest, n, 0); + } +} diff --git a/vendor/compiler_builtins/src/arm_linux.rs b/vendor/compiler_builtins/src/arm_linux.rs new file mode 100644 index 000000000..8fe09485b --- /dev/null +++ b/vendor/compiler_builtins/src/arm_linux.rs @@ -0,0 +1,214 @@ +use core::intrinsics; +use core::mem; + +// Kernel-provided user-mode helper functions: +// https://www.kernel.org/doc/Documentation/arm/kernel_user_helpers.txt +unsafe fn __kuser_cmpxchg(oldval: u32, newval: u32, ptr: *mut u32) -> bool { + let f: extern "C" fn(u32, u32, *mut u32) -> u32 = mem::transmute(0xffff0fc0usize as *const ()); + f(oldval, newval, ptr) == 0 +} +unsafe fn __kuser_memory_barrier() { + let f: extern "C" fn() = mem::transmute(0xffff0fa0usize as *const ()); + f(); +} + +// Word-align a pointer +fn align_ptr(ptr: *mut T) -> *mut u32 { + // This gives us a mask of 0 when T == u32 since the pointer is already + // supposed to be aligned, which avoids any masking in that case. 
+ let ptr_mask = 3 & (4 - mem::size_of::()); + (ptr as usize & !ptr_mask) as *mut u32 +} + +// Calculate the shift and mask of a value inside an aligned word +fn get_shift_mask(ptr: *mut T) -> (u32, u32) { + // Mask to get the low byte/halfword/word + let mask = match mem::size_of::() { + 1 => 0xff, + 2 => 0xffff, + 4 => 0xffffffff, + _ => unreachable!(), + }; + + // If we are on big-endian then we need to adjust the shift accordingly + let endian_adjust = if cfg!(target_endian = "little") { + 0 + } else { + 4 - mem::size_of::() as u32 + }; + + // Shift to get the desired element in the word + let ptr_mask = 3 & (4 - mem::size_of::()); + let shift = ((ptr as usize & ptr_mask) as u32 ^ endian_adjust) * 8; + + (shift, mask) +} + +// Extract a value from an aligned word +fn extract_aligned(aligned: u32, shift: u32, mask: u32) -> u32 { + (aligned >> shift) & mask +} + +// Insert a value into an aligned word +fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 { + (aligned & !(mask << shift)) | ((val & mask) << shift) +} + +// Generic atomic read-modify-write operation +unsafe fn atomic_rmw u32>(ptr: *mut T, f: F) -> u32 { + let aligned_ptr = align_ptr(ptr); + let (shift, mask) = get_shift_mask(ptr); + + loop { + let curval_aligned = intrinsics::atomic_load_unordered(aligned_ptr); + let curval = extract_aligned(curval_aligned, shift, mask); + let newval = f(curval); + let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask); + if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) { + return curval; + } + } +} + +// Generic atomic compare-exchange operation +unsafe fn atomic_cmpxchg(ptr: *mut T, oldval: u32, newval: u32) -> u32 { + let aligned_ptr = align_ptr(ptr); + let (shift, mask) = get_shift_mask(ptr); + + loop { + let curval_aligned = intrinsics::atomic_load_unordered(aligned_ptr); + let curval = extract_aligned(curval_aligned, shift, mask); + if curval != oldval { + return curval; + } + let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask); + if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) { + return oldval; + } + } +} + +macro_rules! atomic_rmw { + ($name:ident, $ty:ty, $op:expr) => { + intrinsics! { + pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty { + atomic_rmw(ptr, |x| $op(x as $ty, val) as u32) as $ty + } + } + }; +} +macro_rules! atomic_cmpxchg { + ($name:ident, $ty:ty) => { + intrinsics! 
{ + pub unsafe extern "C" fn $name(ptr: *mut $ty, oldval: $ty, newval: $ty) -> $ty { + atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty + } + } + }; +} + +atomic_rmw!(__sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b)); +atomic_rmw!(__sync_fetch_and_add_2, u16, |a: u16, b: u16| a + .wrapping_add(b)); +atomic_rmw!(__sync_fetch_and_add_4, u32, |a: u32, b: u32| a + .wrapping_add(b)); + +atomic_rmw!(__sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); +atomic_rmw!(__sync_fetch_and_sub_2, u16, |a: u16, b: u16| a + .wrapping_sub(b)); +atomic_rmw!(__sync_fetch_and_sub_4, u32, |a: u32, b: u32| a + .wrapping_sub(b)); + +atomic_rmw!(__sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b); +atomic_rmw!(__sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b); +atomic_rmw!(__sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b); + +atomic_rmw!(__sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b); +atomic_rmw!(__sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b); +atomic_rmw!(__sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b); + +atomic_rmw!(__sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b); +atomic_rmw!(__sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b); +atomic_rmw!(__sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b); + +atomic_rmw!(__sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b)); +atomic_rmw!(__sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b)); +atomic_rmw!(__sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b)); + +atomic_rmw!(__sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b { + a +} else { + b +}); + +atomic_rmw!(__sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b { + a +} else { + b +}); + +atomic_rmw!(__sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b { + a +} else { + b +}); + +atomic_rmw!(__sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b { + a +} else { + b +}); +atomic_rmw!(__sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b { + a +} else { + b +}); + +atomic_rmw!(__sync_lock_test_and_set_1, u8, |_: u8, b: u8| b); +atomic_rmw!(__sync_lock_test_and_set_2, u16, |_: u16, b: u16| b); +atomic_rmw!(__sync_lock_test_and_set_4, u32, |_: u32, b: u32| b); + +atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8); +atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16); +atomic_cmpxchg!(__sync_val_compare_and_swap_4, u32); + +intrinsics! 
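// A minimal standalone sketch of the sub-word read-modify-write scheme implemented by
// `atomic_rmw` above, using AtomicU32::compare_exchange as a stand-in for the kernel's
// __kuser_cmpxchg helper. Little-endian layout and a byte offset in 0..4 are assumed;
// the generic code above also handles 16-bit elements and big-endian targets.
use core::sync::atomic::{AtomicU32, Ordering};

fn fetch_add_u8_in_word(word: &AtomicU32, off: usize, val: u8) -> u8 {
    let shift = (off as u32) * 8;
    let mask = 0xffu32 << shift;
    loop {
        let cur = word.load(Ordering::Relaxed);
        let old = ((cur & mask) >> shift) as u8;
        let new = (cur & !mask) | ((old.wrapping_add(val) as u32) << shift);
        // Retry until the word has not changed between the load and the CAS.
        if word
            .compare_exchange(cur, new, Ordering::SeqCst, Ordering::Relaxed)
            .is_ok()
        {
            return old;
        }
    }
}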
{ + pub unsafe extern "C" fn __sync_synchronize() { + __kuser_memory_barrier(); + } +} diff --git a/vendor/compiler_builtins/src/float/add.rs b/vendor/compiler_builtins/src/float/add.rs new file mode 100644 index 000000000..67f6c2c14 --- /dev/null +++ b/vendor/compiler_builtins/src/float/add.rs @@ -0,0 +1,213 @@ +use float::Float; +use int::{CastInto, Int}; + +/// Returns `a + b` +fn add(a: F, b: F) -> F +where + u32: CastInto, + F::Int: CastInto, + i32: CastInto, + F::Int: CastInto, +{ + let one = F::Int::ONE; + let zero = F::Int::ZERO; + + let bits = F::BITS.cast(); + let significand_bits = F::SIGNIFICAND_BITS; + let max_exponent = F::EXPONENT_MAX; + + let implicit_bit = F::IMPLICIT_BIT; + let significand_mask = F::SIGNIFICAND_MASK; + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + let quiet_bit = implicit_bit >> 1; + let qnan_rep = exponent_mask | quiet_bit; + + let mut a_rep = a.repr(); + let mut b_rep = b.repr(); + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + // Detect if a or b is zero, infinity, or NaN. + if a_abs.wrapping_sub(one) >= inf_rep - one || b_abs.wrapping_sub(one) >= inf_rep - one { + // NaN + anything = qNaN + if a_abs > inf_rep { + return F::from_repr(a_abs | quiet_bit); + } + // anything + NaN = qNaN + if b_abs > inf_rep { + return F::from_repr(b_abs | quiet_bit); + } + + if a_abs == inf_rep { + // +/-infinity + -/+infinity = qNaN + if (a.repr() ^ b.repr()) == sign_bit { + return F::from_repr(qnan_rep); + } else { + // +/-infinity + anything remaining = +/- infinity + return a; + } + } + + // anything remaining + +/-infinity = +/-infinity + if b_abs == inf_rep { + return b; + } + + // zero + anything = anything + if a_abs == Int::ZERO { + // but we need to get the sign right for zero + zero + if b_abs == Int::ZERO { + return F::from_repr(a.repr() & b.repr()); + } else { + return b; + } + } + + // anything + zero = anything + if b_abs == Int::ZERO { + return a; + } + } + + // Swap a and b if necessary so that a has the larger absolute value. + if b_abs > a_abs { + // Don't use mem::swap because it may generate references to memcpy in unoptimized code. + let tmp = a_rep; + a_rep = b_rep; + b_rep = tmp; + } + + // Extract the exponent and significand from the (possibly swapped) a and b. + let mut a_exponent: i32 = ((a_rep & exponent_mask) >> significand_bits).cast(); + let mut b_exponent: i32 = ((b_rep & exponent_mask) >> significand_bits).cast(); + let mut a_significand = a_rep & significand_mask; + let mut b_significand = b_rep & significand_mask; + + // normalize any denormals, and adjust the exponent accordingly. + if a_exponent == 0 { + let (exponent, significand) = F::normalize(a_significand); + a_exponent = exponent; + a_significand = significand; + } + if b_exponent == 0 { + let (exponent, significand) = F::normalize(b_significand); + b_exponent = exponent; + b_significand = significand; + } + + // The sign of the result is the sign of the larger operand, a. If they + // have opposite signs, we are performing a subtraction; otherwise addition. + let result_sign = a_rep & sign_bit; + let subtraction = ((a_rep ^ b_rep) & sign_bit) != zero; + + // Shift the significands to give us round, guard and sticky, and or in the + // implicit significand bit. (If we fell through from the denormal path it + // was already set by normalize(), but setting it twice won't hurt + // anything.) 
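// A minimal sketch of the "sticky" right shift used just below when aligning b's
// significand: the bits that fall off the bottom are collapsed into bit 0 so that the
// final rounding step still knows the discarded part was non-zero. Assumes
// 0 < shift < 32; the generic code also handles the shift >= width case.
fn sticky_shift_right(x: u32, shift: u32) -> u32 {
    let lost = x & ((1u32 << shift) - 1); // bits that would be discarded
    (x >> shift) | (lost != 0) as u32
}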
+ a_significand = (a_significand | implicit_bit) << 3; + b_significand = (b_significand | implicit_bit) << 3; + + // Shift the significand of b by the difference in exponents, with a sticky + // bottom bit to get rounding correct. + let align = a_exponent.wrapping_sub(b_exponent).cast(); + if align != Int::ZERO { + if align < bits { + let sticky = + F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO); + b_significand = (b_significand >> align.cast()) | sticky; + } else { + b_significand = one; // sticky; b is known to be non-zero. + } + } + if subtraction { + a_significand = a_significand.wrapping_sub(b_significand); + // If a == -b, return +zero. + if a_significand == Int::ZERO { + return F::from_repr(Int::ZERO); + } + + // If partial cancellation occured, we need to left-shift the result + // and adjust the exponent: + if a_significand < implicit_bit << 3 { + let shift = + a_significand.leading_zeros() as i32 - (implicit_bit << 3).leading_zeros() as i32; + a_significand <<= shift; + a_exponent -= shift; + } + } else { + // addition + a_significand += b_significand; + + // If the addition carried up, we need to right-shift the result and + // adjust the exponent: + if a_significand & implicit_bit << 4 != Int::ZERO { + let sticky = F::Int::from_bool(a_significand & one != Int::ZERO); + a_significand = a_significand >> 1 | sticky; + a_exponent += 1; + } + } + + // If we have overflowed the type, return +/- infinity: + if a_exponent >= max_exponent as i32 { + return F::from_repr(inf_rep | result_sign); + } + + if a_exponent <= 0 { + // Result is denormal before rounding; the exponent is zero and we + // need to shift the significand. + let shift = (1 - a_exponent).cast(); + let sticky = + F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO); + a_significand = a_significand >> shift.cast() | sticky; + a_exponent = 0; + } + + // Low three bits are round, guard, and sticky. + let a_significand_i32: i32 = a_significand.cast(); + let round_guard_sticky: i32 = a_significand_i32 & 0x7; + + // Shift the significand into place, and mask off the implicit bit. + let mut result = a_significand >> 3 & significand_mask; + + // Insert the exponent and sign. + result |= a_exponent.cast() << significand_bits; + result |= result_sign; + + // Final rounding. The result may overflow to infinity, but that is the + // correct result in that case. + if round_guard_sticky > 0x4 { + result += one; + } + if round_guard_sticky == 0x4 { + result += result & one; + } + + F::from_repr(result) +} + +intrinsics! 
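// A minimal sketch of the final rounding step of `add` above: the low three bits of the
// working significand are round, guard and sticky, and the value is rounded to nearest
// with ties to even, exactly as the two `if`s at the end of the function do (there the
// increment is applied to the full bit pattern so a carry may overflow into the exponent,
// which is the intended behavior).
fn round_nearest_even(sig_with_rgs: u32) -> u32 {
    let rgs = sig_with_rgs & 0x7; // round, guard, sticky
    let mut result = sig_with_rgs >> 3;
    if rgs > 0x4 {
        result += 1; // more than halfway: round up
    } else if rgs == 0x4 {
        result += result & 1; // exactly halfway: round to even
    }
    result
}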
{ + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_fadd] + pub extern "C" fn __addsf3(a: f32, b: f32) -> f32 { + add(a, b) + } + + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_dadd] + pub extern "C" fn __adddf3(a: f64, b: f64) -> f64 { + add(a, b) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __addsf3vfp(a: f32, b: f32) -> f32 { + a + b + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __adddf3vfp(a: f64, b: f64) -> f64 { + a + b + } +} diff --git a/vendor/compiler_builtins/src/float/cmp.rs b/vendor/compiler_builtins/src/float/cmp.rs new file mode 100644 index 000000000..1d4e38433 --- /dev/null +++ b/vendor/compiler_builtins/src/float/cmp.rs @@ -0,0 +1,253 @@ +#![allow(unreachable_code)] + +use float::Float; +use int::Int; + +#[derive(Clone, Copy)] +enum Result { + Less, + Equal, + Greater, + Unordered, +} + +impl Result { + fn to_le_abi(self) -> i32 { + match self { + Result::Less => -1, + Result::Equal => 0, + Result::Greater => 1, + Result::Unordered => 1, + } + } + + fn to_ge_abi(self) -> i32 { + match self { + Result::Less => -1, + Result::Equal => 0, + Result::Greater => 1, + Result::Unordered => -1, + } + } +} + +fn cmp(a: F, b: F) -> Result { + let one = F::Int::ONE; + let zero = F::Int::ZERO; + let szero = F::SignedInt::ZERO; + + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + + let a_rep = a.repr(); + let b_rep = b.repr(); + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + // If either a or b is NaN, they are unordered. + if a_abs > inf_rep || b_abs > inf_rep { + return Result::Unordered; + } + + // If a and b are both zeros, they are equal. + if a_abs | b_abs == zero { + return Result::Equal; + } + + let a_srep = a.signed_repr(); + let b_srep = b.signed_repr(); + + // If at least one of a and b is positive, we get the same result comparing + // a and b as signed integers as we would with a fp_ting-point compare. + if a_srep & b_srep >= szero { + if a_srep < b_srep { + Result::Less + } else if a_srep == b_srep { + Result::Equal + } else { + Result::Greater + } + // Otherwise, both are negative, so we need to flip the sense of the + // comparison to get the correct result. (This assumes a twos- or ones- + // complement integer representation; if integers are represented in a + // sign-magnitude representation, then this flip is incorrect). + } else if a_srep > b_srep { + Result::Less + } else if a_srep == b_srep { + Result::Equal + } else { + Result::Greater + } +} + +fn unord(a: F, b: F) -> bool { + let one = F::Int::ONE; + + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + + let a_rep = a.repr(); + let b_rep = b.repr(); + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + a_abs > inf_rep || b_abs > inf_rep +} + +intrinsics! 
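// A minimal sketch of the ordering trick used by `cmp` above, specialised to f32 and
// restricted to finite, non-zero, non-NaN inputs: when at least one operand is
// non-negative, the IEEE order matches the order of the signed bit patterns; when both
// are negative, the comparison is simply flipped. Zero and NaN handling (done earlier in
// `cmp`) is deliberately omitted here.
fn cmp_finite_f32(a: f32, b: f32) -> core::cmp::Ordering {
    let (ai, bi) = (a.to_bits() as i32, b.to_bits() as i32);
    if ai & bi >= 0 {
        ai.cmp(&bi) // at least one operand non-negative
    } else {
        bi.cmp(&ai) // both negative: flip the sense of the comparison
    }
}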
{ + pub extern "C" fn __lesf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __gesf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_ge_abi() + } + + #[arm_aeabi_alias = __aeabi_fcmpun] + pub extern "C" fn __unordsf2(a: f32, b: f32) -> i32 { + unord(a, b) as i32 + } + + pub extern "C" fn __eqsf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __ltsf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __nesf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __gtsf2(a: f32, b: f32) -> i32 { + cmp(a, b).to_ge_abi() + } + + pub extern "C" fn __ledf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __gedf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_ge_abi() + } + + #[arm_aeabi_alias = __aeabi_dcmpun] + pub extern "C" fn __unorddf2(a: f64, b: f64) -> i32 { + unord(a, b) as i32 + } + + pub extern "C" fn __eqdf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __ltdf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __nedf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_le_abi() + } + + pub extern "C" fn __gtdf2(a: f64, b: f64) -> i32 { + cmp(a, b).to_ge_abi() + } +} + +#[cfg(target_arch = "arm")] +intrinsics! { + pub extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 { + (__lesf2(a, b) <= 0) as i32 + } + + pub extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 { + (__gesf2(a, b) >= 0) as i32 + } + + pub extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 { + (__eqsf2(a, b) == 0) as i32 + } + + pub extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 { + (__ltsf2(a, b) < 0) as i32 + } + + pub extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 { + (__gtsf2(a, b) > 0) as i32 + } + + pub extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 { + (__ledf2(a, b) <= 0) as i32 + } + + pub extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 { + (__gedf2(a, b) >= 0) as i32 + } + + pub extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 { + (__eqdf2(a, b) == 0) as i32 + } + + pub extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 { + (__ltdf2(a, b) < 0) as i32 + } + + pub extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 { + (__gtdf2(a, b) > 0) as i32 + } + + // On hard-float targets LLVM will use native instructions + // for all VFP intrinsics below + + pub extern "C" fn __gesf2vfp(a: f32, b: f32) -> i32 { + (a >= b) as i32 + } + + pub extern "C" fn __gedf2vfp(a: f64, b: f64) -> i32 { + (a >= b) as i32 + } + + pub extern "C" fn __gtsf2vfp(a: f32, b: f32) -> i32 { + (a > b) as i32 + } + + pub extern "C" fn __gtdf2vfp(a: f64, b: f64) -> i32 { + (a > b) as i32 + } + + pub extern "C" fn __ltsf2vfp(a: f32, b: f32) -> i32 { + (a < b) as i32 + } + + pub extern "C" fn __ltdf2vfp(a: f64, b: f64) -> i32 { + (a < b) as i32 + } + + pub extern "C" fn __lesf2vfp(a: f32, b: f32) -> i32 { + (a <= b) as i32 + } + + pub extern "C" fn __ledf2vfp(a: f64, b: f64) -> i32 { + (a <= b) as i32 + } + + pub extern "C" fn __nesf2vfp(a: f32, b: f32) -> i32 { + (a != b) as i32 + } + + pub extern "C" fn __nedf2vfp(a: f64, b: f64) -> i32 { + (a != b) as i32 + } + + pub extern "C" fn __eqsf2vfp(a: f32, b: f32) -> i32 { + (a == b) as i32 + } + + pub extern "C" fn __eqdf2vfp(a: f64, b: f64) -> i32 { + (a == b) as i32 + } +} diff --git a/vendor/compiler_builtins/src/float/conv.rs b/vendor/compiler_builtins/src/float/conv.rs new file mode 100644 index 000000000..07b58f3d2 --- /dev/null +++ b/vendor/compiler_builtins/src/float/conv.rs @@ 
-0,0 +1,351 @@ +/// Conversions from integers to floats. +/// +/// These are hand-optimized bit twiddling code, +/// which unfortunately isn't the easiest kind of code to read. +/// +/// The algorithm is explained here: https://blog.m-ou.se/floats/ +mod int_to_float { + pub fn u32_to_f32_bits(i: u32) -> u32 { + if i == 0 { + return 0; + } + let n = i.leading_zeros(); + let a = (i << n) >> 8; // Significant bits, with bit 24 still in tact. + let b = (i << n) << 24; // Insignificant bits, only relevant for rounding. + let m = a + ((b - (b >> 31 & !a)) >> 31); // Add one when we need to round up. Break ties to even. + let e = 157 - n as u32; // Exponent plus 127, minus one. + (e << 23) + m // + not |, so the mantissa can overflow into the exponent. + } + + pub fn u32_to_f64_bits(i: u32) -> u64 { + if i == 0 { + return 0; + } + let n = i.leading_zeros(); + let m = (i as u64) << (21 + n); // Significant bits, with bit 53 still in tact. + let e = 1053 - n as u64; // Exponent plus 1023, minus one. + (e << 52) + m // Bit 53 of m will overflow into e. + } + + pub fn u64_to_f32_bits(i: u64) -> u32 { + let n = i.leading_zeros(); + let y = i.wrapping_shl(n); + let a = (y >> 40) as u32; // Significant bits, with bit 24 still in tact. + let b = (y >> 8 | y & 0xFFFF) as u32; // Insignificant bits, only relevant for rounding. + let m = a + ((b - (b >> 31 & !a)) >> 31); // Add one when we need to round up. Break ties to even. + let e = if i == 0 { 0 } else { 189 - n }; // Exponent plus 127, minus one, except for zero. + (e << 23) + m // + not |, so the mantissa can overflow into the exponent. + } + + pub fn u64_to_f64_bits(i: u64) -> u64 { + if i == 0 { + return 0; + } + let n = i.leading_zeros(); + let a = ((i << n) >> 11) as u64; // Significant bits, with bit 53 still in tact. + let b = ((i << n) << 53) as u64; // Insignificant bits, only relevant for rounding. + let m = a + ((b - (b >> 63 & !a)) >> 63); // Add one when we need to round up. Break ties to even. + let e = 1085 - n as u64; // Exponent plus 1023, minus one. + (e << 52) + m // + not |, so the mantissa can overflow into the exponent. + } + + pub fn u128_to_f32_bits(i: u128) -> u32 { + let n = i.leading_zeros(); + let y = i.wrapping_shl(n); + let a = (y >> 104) as u32; // Significant bits, with bit 24 still in tact. + let b = (y >> 72) as u32 | ((y << 32) >> 32 != 0) as u32; // Insignificant bits, only relevant for rounding. + let m = a + ((b - (b >> 31 & !a)) >> 31); // Add one when we need to round up. Break ties to even. + let e = if i == 0 { 0 } else { 253 - n }; // Exponent plus 127, minus one, except for zero. + (e << 23) + m // + not |, so the mantissa can overflow into the exponent. + } + + pub fn u128_to_f64_bits(i: u128) -> u64 { + let n = i.leading_zeros(); + let y = i.wrapping_shl(n); + let a = (y >> 75) as u64; // Significant bits, with bit 53 still in tact. + let b = (y >> 11 | y & 0xFFFF_FFFF) as u64; // Insignificant bits, only relevant for rounding. + let m = a + ((b - (b >> 63 & !a)) >> 63); // Add one when we need to round up. Break ties to even. + let e = if i == 0 { 0 } else { 1149 - n as u64 }; // Exponent plus 1023, minus one, except for zero. + (e << 52) + m // + not |, so the mantissa can overflow into the exponent. + } +} + +// Conversions from unsigned integers to floats. +intrinsics! 
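// A quick spot check (assuming a test environment with panic support) that the
// bit-twiddling converter defined in the module above agrees with the compiler's own
// rounding cast, including a value exactly at the 24-bit boundary and one just past it.
fn spot_check_u32_to_f32() {
    for &i in &[0u32, 1, 7, 0x00ff_ffff, 0x0100_0001, u32::MAX] {
        assert_eq!(
            f32::from_bits(int_to_float::u32_to_f32_bits(i)),
            i as f32 // `as` also rounds to nearest, ties to even
        );
    }
}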
{ + #[arm_aeabi_alias = __aeabi_ui2f] + pub extern "C" fn __floatunsisf(i: u32) -> f32 { + f32::from_bits(int_to_float::u32_to_f32_bits(i)) + } + + #[arm_aeabi_alias = __aeabi_ui2d] + pub extern "C" fn __floatunsidf(i: u32) -> f64 { + f64::from_bits(int_to_float::u32_to_f64_bits(i)) + } + + #[arm_aeabi_alias = __aeabi_ul2f] + pub extern "C" fn __floatundisf(i: u64) -> f32 { + f32::from_bits(int_to_float::u64_to_f32_bits(i)) + } + + #[arm_aeabi_alias = __aeabi_ul2d] + pub extern "C" fn __floatundidf(i: u64) -> f64 { + f64::from_bits(int_to_float::u64_to_f64_bits(i)) + } + + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __floatuntisf(i: u128) -> f32 { + f32::from_bits(int_to_float::u128_to_f32_bits(i)) + } + + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __floatuntidf(i: u128) -> f64 { + f64::from_bits(int_to_float::u128_to_f64_bits(i)) + } +} + +// Conversions from signed integers to floats. +intrinsics! { + #[arm_aeabi_alias = __aeabi_i2f] + pub extern "C" fn __floatsisf(i: i32) -> f32 { + let sign_bit = ((i >> 31) as u32) << 31; + f32::from_bits(int_to_float::u32_to_f32_bits(i.unsigned_abs()) | sign_bit) + } + + #[arm_aeabi_alias = __aeabi_i2d] + pub extern "C" fn __floatsidf(i: i32) -> f64 { + let sign_bit = ((i >> 31) as u64) << 63; + f64::from_bits(int_to_float::u32_to_f64_bits(i.unsigned_abs()) | sign_bit) + } + + #[arm_aeabi_alias = __aeabi_l2f] + pub extern "C" fn __floatdisf(i: i64) -> f32 { + let sign_bit = ((i >> 63) as u32) << 31; + f32::from_bits(int_to_float::u64_to_f32_bits(i.unsigned_abs()) | sign_bit) + } + + #[arm_aeabi_alias = __aeabi_l2d] + pub extern "C" fn __floatdidf(i: i64) -> f64 { + let sign_bit = ((i >> 63) as u64) << 63; + f64::from_bits(int_to_float::u64_to_f64_bits(i.unsigned_abs()) | sign_bit) + } + + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __floattisf(i: i128) -> f32 { + let sign_bit = ((i >> 127) as u32) << 31; + f32::from_bits(int_to_float::u128_to_f32_bits(i.unsigned_abs()) | sign_bit) + } + + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __floattidf(i: i128) -> f64 { + let sign_bit = ((i >> 127) as u64) << 63; + f64::from_bits(int_to_float::u128_to_f64_bits(i.unsigned_abs()) | sign_bit) + } +} + +// Conversions from floats to unsigned integers. +intrinsics! { + #[arm_aeabi_alias = __aeabi_f2uiz] + pub extern "C" fn __fixunssfsi(f: f32) -> u32 { + let fbits = f.to_bits(); + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 159 << 23 { // >= 1, < max + let m = 1 << 31 | fbits << 8; // Mantissa and the implicit 1-bit. + let s = 158 - (fbits >> 23); // Shift based on the exponent and bias. + m >> s + } else if fbits <= 255 << 23 { // >= max (incl. inf) + u32::MAX + } else { // Negative or NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_f2ulz] + pub extern "C" fn __fixunssfdi(f: f32) -> u64 { + let fbits = f.to_bits(); + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 191 << 23 { // >= 1, < max + let m = 1 << 63 | (fbits as u64) << 40; // Mantissa and the implicit 1-bit. + let s = 190 - (fbits >> 23); // Shift based on the exponent and bias. + m >> s + } else if fbits <= 255 << 23 { // >= max (incl. 
inf) + u64::MAX + } else { // Negative or NaN + 0 + } + } + + #[cfg_attr(target_feature = "llvm14-builtins-abi", win64_128bit_abi_hack)] + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __fixunssfti(f: f32) -> u128 { + let fbits = f.to_bits(); + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 255 << 23 { // >= 1, < inf + let m = 1 << 127 | (fbits as u128) << 104; // Mantissa and the implicit 1-bit. + let s = 254 - (fbits >> 23); // Shift based on the exponent and bias. + m >> s + } else if fbits == 255 << 23 { // == inf + u128::MAX + } else { // Negative or NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_d2uiz] + pub extern "C" fn __fixunsdfsi(f: f64) -> u32 { + let fbits = f.to_bits(); + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1055 << 52 { // >= 1, < max + let m = 1 << 31 | (fbits >> 21) as u32; // Mantissa and the implicit 1-bit. + let s = 1054 - (fbits >> 52); // Shift based on the exponent and bias. + m >> s + } else if fbits <= 2047 << 52 { // >= max (incl. inf) + u32::MAX + } else { // Negative or NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_d2ulz] + pub extern "C" fn __fixunsdfdi(f: f64) -> u64 { + let fbits = f.to_bits(); + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1087 << 52 { // >= 1, < max + let m = 1 << 63 | fbits << 11; // Mantissa and the implicit 1-bit. + let s = 1086 - (fbits >> 52); // Shift based on the exponent and bias. + m >> s + } else if fbits <= 2047 << 52 { // >= max (incl. inf) + u64::MAX + } else { // Negative or NaN + 0 + } + } + + #[cfg_attr(target_feature = "llvm14-builtins-abi", win64_128bit_abi_hack)] + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __fixunsdfti(f: f64) -> u128 { + let fbits = f.to_bits(); + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1151 << 52 { // >= 1, < max + let m = 1 << 127 | (fbits as u128) << 75; // Mantissa and the implicit 1-bit. + let s = 1150 - (fbits >> 52); // Shift based on the exponent and bias. + m >> s + } else if fbits <= 2047 << 52 { // >= max (incl. inf) + u128::MAX + } else { // Negative or NaN + 0 + } + } +} + +// Conversions from floats to signed integers. +intrinsics! { + #[arm_aeabi_alias = __aeabi_f2iz] + pub extern "C" fn __fixsfsi(f: f32) -> i32 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 158 << 23 { // >= 1, < max + let m = 1 << 31 | fbits << 8; // Mantissa and the implicit 1-bit. + let s = 158 - (fbits >> 23); // Shift based on the exponent and bias. + let u = (m >> s) as i32; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 255 << 23 { // >= max (incl. inf) + if f.is_sign_negative() { i32::MIN } else { i32::MAX } + } else { // NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_f2lz] + pub extern "C" fn __fixsfdi(f: f32) -> i64 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 190 << 23 { // >= 1, < max + let m = 1 << 63 | (fbits as u64) << 40; // Mantissa and the implicit 1-bit. + let s = 190 - (fbits >> 23); // Shift based on the exponent and bias. + let u = (m >> s) as i64; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 255 << 23 { // >= max (incl. 
inf) + if f.is_sign_negative() { i64::MIN } else { i64::MAX } + } else { // NaN + 0 + } + } + + #[cfg_attr(target_feature = "llvm14-builtins-abi", win64_128bit_abi_hack)] + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __fixsfti(f: f32) -> i128 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 127 << 23 { // >= 0, < 1 + 0 + } else if fbits < 254 << 23 { // >= 1, < max + let m = 1 << 127 | (fbits as u128) << 104; // Mantissa and the implicit 1-bit. + let s = 254 - (fbits >> 23); // Shift based on the exponent and bias. + let u = (m >> s) as i128; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 255 << 23 { // >= max (incl. inf) + if f.is_sign_negative() { i128::MIN } else { i128::MAX } + } else { // NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_d2iz] + pub extern "C" fn __fixdfsi(f: f64) -> i32 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1054 << 52 { // >= 1, < max + let m = 1 << 31 | (fbits >> 21) as u32; // Mantissa and the implicit 1-bit. + let s = 1054 - (fbits >> 52); // Shift based on the exponent and bias. + let u = (m >> s) as i32; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 2047 << 52 { // >= max (incl. inf) + if f.is_sign_negative() { i32::MIN } else { i32::MAX } + } else { // NaN + 0 + } + } + + #[arm_aeabi_alias = __aeabi_d2lz] + pub extern "C" fn __fixdfdi(f: f64) -> i64 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1086 << 52 { // >= 1, < max + let m = 1 << 63 | fbits << 11; // Mantissa and the implicit 1-bit. + let s = 1086 - (fbits >> 52); // Shift based on the exponent and bias. + let u = (m >> s) as i64; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 2047 << 52 { // >= max (incl. inf) + if f.is_sign_negative() { i64::MIN } else { i64::MAX } + } else { // NaN + 0 + } + } + + #[cfg_attr(target_feature = "llvm14-builtins-abi", win64_128bit_abi_hack)] + #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + pub extern "C" fn __fixdfti(f: f64) -> i128 { + let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. + if fbits < 1023 << 52 { // >= 0, < 1 + 0 + } else if fbits < 1150 << 52 { // >= 1, < max + let m = 1 << 127 | (fbits as u128) << 75; // Mantissa and the implicit 1-bit. + let s = 1150 - (fbits >> 52); // Shift based on the exponent and bias. + let u = (m >> s) as i128; // Unsigned result. + if f.is_sign_negative() { -u } else { u } + } else if fbits <= 2047 << 52 { // >= max (incl. 
inf) + if f.is_sign_negative() { i128::MIN } else { i128::MAX } + } else { // NaN + 0 + } + } +} diff --git a/vendor/compiler_builtins/src/float/div.rs b/vendor/compiler_builtins/src/float/div.rs new file mode 100644 index 000000000..528a8368d --- /dev/null +++ b/vendor/compiler_builtins/src/float/div.rs @@ -0,0 +1,467 @@ +// The functions are complex with many branches, and explicit +// `return`s makes it clear where function exit points are +#![allow(clippy::needless_return)] + +use float::Float; +use int::{CastInto, DInt, HInt, Int}; + +fn div32(a: F, b: F) -> F +where + u32: CastInto, + F::Int: CastInto, + i32: CastInto, + F::Int: CastInto, + F::Int: HInt, +{ + let one = F::Int::ONE; + let zero = F::Int::ZERO; + + // let bits = F::BITS; + let significand_bits = F::SIGNIFICAND_BITS; + let max_exponent = F::EXPONENT_MAX; + + let exponent_bias = F::EXPONENT_BIAS; + + let implicit_bit = F::IMPLICIT_BIT; + let significand_mask = F::SIGNIFICAND_MASK; + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + let quiet_bit = implicit_bit >> 1; + let qnan_rep = exponent_mask | quiet_bit; + + #[inline(always)] + fn negate_u32(a: u32) -> u32 { + (::wrapping_neg(a as i32)) as u32 + } + + let a_rep = a.repr(); + let b_rep = b.repr(); + + let a_exponent = (a_rep >> significand_bits) & max_exponent.cast(); + let b_exponent = (b_rep >> significand_bits) & max_exponent.cast(); + let quotient_sign = (a_rep ^ b_rep) & sign_bit; + + let mut a_significand = a_rep & significand_mask; + let mut b_significand = b_rep & significand_mask; + let mut scale = 0; + + // Detect if a or b is zero, denormal, infinity, or NaN. + if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + { + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + // NaN / anything = qNaN + if a_abs > inf_rep { + return F::from_repr(a_rep | quiet_bit); + } + // anything / NaN = qNaN + if b_abs > inf_rep { + return F::from_repr(b_rep | quiet_bit); + } + + if a_abs == inf_rep { + if b_abs == inf_rep { + // infinity / infinity = NaN + return F::from_repr(qnan_rep); + } else { + // infinity / anything else = +/- infinity + return F::from_repr(a_abs | quotient_sign); + } + } + + // anything else / infinity = +/- 0 + if b_abs == inf_rep { + return F::from_repr(quotient_sign); + } + + if a_abs == zero { + if b_abs == zero { + // zero / zero = NaN + return F::from_repr(qnan_rep); + } else { + // zero / anything else = +/- zero + return F::from_repr(quotient_sign); + } + } + + // anything else / zero = +/- infinity + if b_abs == zero { + return F::from_repr(inf_rep | quotient_sign); + } + + // one or both of a or b is denormal, the other (if applicable) is a + // normal number. Renormalize one or both of a and b, and set scale to + // include the necessary exponent adjustment. + if a_abs < implicit_bit { + let (exponent, significand) = F::normalize(a_significand); + scale += exponent; + a_significand = significand; + } + + if b_abs < implicit_bit { + let (exponent, significand) = F::normalize(b_significand); + scale -= exponent; + b_significand = significand; + } + } + + // Or in the implicit significand bit. (If we fell through from the + // denormal path it was already set by normalize( ), but setting it twice + // won't hurt anything.) 
+ a_significand |= implicit_bit; + b_significand |= implicit_bit; + let mut quotient_exponent: i32 = CastInto::::cast(a_exponent) + .wrapping_sub(CastInto::::cast(b_exponent)) + .wrapping_add(scale); + + // Align the significand of b as a Q31 fixed-point number in the range + // [1, 2.0) and get a Q32 approximate reciprocal using a small minimax + // polynomial approximation: reciprocal = 3/4 + 1/sqrt(2) - b/2. This + // is accurate to about 3.5 binary digits. + let q31b = CastInto::::cast(b_significand << 8.cast()); + let mut reciprocal = (0x7504f333u32).wrapping_sub(q31b); + + // Now refine the reciprocal estimate using a Newton-Raphson iteration: + // + // x1 = x0 * (2 - x0 * b) + // + // This doubles the number of correct binary digits in the approximation + // with each iteration, so after three iterations, we have about 28 binary + // digits of accuracy. + + let mut correction: u32 = + negate_u32(((reciprocal as u64).wrapping_mul(q31b as u64) >> 32) as u32); + reciprocal = ((reciprocal as u64).wrapping_mul(correction as u64) as u64 >> 31) as u32; + correction = negate_u32(((reciprocal as u64).wrapping_mul(q31b as u64) >> 32) as u32); + reciprocal = ((reciprocal as u64).wrapping_mul(correction as u64) as u64 >> 31) as u32; + correction = negate_u32(((reciprocal as u64).wrapping_mul(q31b as u64) >> 32) as u32); + reciprocal = ((reciprocal as u64).wrapping_mul(correction as u64) as u64 >> 31) as u32; + + // Exhaustive testing shows that the error in reciprocal after three steps + // is in the interval [-0x1.f58108p-31, 0x1.d0e48cp-29], in line with our + // expectations. We bump the reciprocal by a tiny value to force the error + // to be strictly positive (in the range [0x1.4fdfp-37,0x1.287246p-29], to + // be specific). This also causes 1/1 to give a sensible approximation + // instead of zero (due to overflow). + reciprocal = reciprocal.wrapping_sub(2); + + // The numerical reciprocal is accurate to within 2^-28, lies in the + // interval [0x1.000000eep-1, 0x1.fffffffcp-1], and is strictly smaller + // than the true reciprocal of b. Multiplying a by this reciprocal thus + // gives a numerical q = a/b in Q24 with the following properties: + // + // 1. q < a/b + // 2. q is in the interval [0x1.000000eep-1, 0x1.fffffffcp0) + // 3. the error in q is at most 2^-24 + 2^-27 -- the 2^24 term comes + // from the fact that we truncate the product, and the 2^27 term + // is the error in the reciprocal of b scaled by the maximum + // possible value of a. As a consequence of this error bound, + // either q or nextafter(q) is the correctly rounded + let mut quotient = (a_significand << 1).widen_mul(reciprocal.cast()).hi(); + + // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0). + // In either case, we are going to compute a residual of the form + // + // r = a - q*b + // + // We know from the construction of q that r satisfies: + // + // 0 <= r < ulp(q)*b + // + // if r is greater than 1/2 ulp(q)*b, then q rounds up. Otherwise, we + // already have the correct result. The exact halfway case cannot occur. + // We also take this time to right shift quotient if it falls in the [1,2) + // range and adjust the exponent accordingly. 
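// A minimal integer-only sketch of the rounding test described above: with q the
// truncated quotient, the remainder r = a - q*b satisfies 0 <= r < b, and the quotient
// must round up exactly when r exceeds half an ulp, i.e. when 2*r > b. (As the comment
// above notes, the exact halfway case cannot occur.) Assumes b is small enough that the
// doubling cannot overflow.
fn rounds_up(a: u64, b: u64) -> bool {
    let q = a / b; // truncated quotient
    let r = a - q * b; // 0 <= r < b
    2 * r > b
}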
+ let residual = if quotient < (implicit_bit << 1) { + quotient_exponent = quotient_exponent.wrapping_sub(1); + (a_significand << (significand_bits + 1)).wrapping_sub(quotient.wrapping_mul(b_significand)) + } else { + quotient >>= 1; + (a_significand << significand_bits).wrapping_sub(quotient.wrapping_mul(b_significand)) + }; + + let written_exponent = quotient_exponent.wrapping_add(exponent_bias as i32); + + if written_exponent >= max_exponent as i32 { + // If we have overflowed the exponent, return infinity. + return F::from_repr(inf_rep | quotient_sign); + } else if written_exponent < 1 { + // Flush denormals to zero. In the future, it would be nice to add + // code to round them correctly. + return F::from_repr(quotient_sign); + } else { + let round = ((residual << 1) > b_significand) as u32; + // Clear the implicit bits + let mut abs_result = quotient & significand_mask; + // Insert the exponent + abs_result |= written_exponent.cast() << significand_bits; + // Round + abs_result = abs_result.wrapping_add(round.cast()); + // Insert the sign and return + return F::from_repr(abs_result | quotient_sign); + } +} + +fn div64(a: F, b: F) -> F +where + u32: CastInto, + F::Int: CastInto, + i32: CastInto, + F::Int: CastInto, + u64: CastInto, + F::Int: CastInto, + i64: CastInto, + F::Int: CastInto, + F::Int: HInt, +{ + let one = F::Int::ONE; + let zero = F::Int::ZERO; + + // let bits = F::BITS; + let significand_bits = F::SIGNIFICAND_BITS; + let max_exponent = F::EXPONENT_MAX; + + let exponent_bias = F::EXPONENT_BIAS; + + let implicit_bit = F::IMPLICIT_BIT; + let significand_mask = F::SIGNIFICAND_MASK; + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + let quiet_bit = implicit_bit >> 1; + let qnan_rep = exponent_mask | quiet_bit; + // let exponent_bits = F::EXPONENT_BITS; + + #[inline(always)] + fn negate_u32(a: u32) -> u32 { + (::wrapping_neg(a as i32)) as u32 + } + + #[inline(always)] + fn negate_u64(a: u64) -> u64 { + (::wrapping_neg(a as i64)) as u64 + } + + let a_rep = a.repr(); + let b_rep = b.repr(); + + let a_exponent = (a_rep >> significand_bits) & max_exponent.cast(); + let b_exponent = (b_rep >> significand_bits) & max_exponent.cast(); + let quotient_sign = (a_rep ^ b_rep) & sign_bit; + + let mut a_significand = a_rep & significand_mask; + let mut b_significand = b_rep & significand_mask; + let mut scale = 0; + + // Detect if a or b is zero, denormal, infinity, or NaN. 
+ if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + { + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + // NaN / anything = qNaN + if a_abs > inf_rep { + return F::from_repr(a_rep | quiet_bit); + } + // anything / NaN = qNaN + if b_abs > inf_rep { + return F::from_repr(b_rep | quiet_bit); + } + + if a_abs == inf_rep { + if b_abs == inf_rep { + // infinity / infinity = NaN + return F::from_repr(qnan_rep); + } else { + // infinity / anything else = +/- infinity + return F::from_repr(a_abs | quotient_sign); + } + } + + // anything else / infinity = +/- 0 + if b_abs == inf_rep { + return F::from_repr(quotient_sign); + } + + if a_abs == zero { + if b_abs == zero { + // zero / zero = NaN + return F::from_repr(qnan_rep); + } else { + // zero / anything else = +/- zero + return F::from_repr(quotient_sign); + } + } + + // anything else / zero = +/- infinity + if b_abs == zero { + return F::from_repr(inf_rep | quotient_sign); + } + + // one or both of a or b is denormal, the other (if applicable) is a + // normal number. Renormalize one or both of a and b, and set scale to + // include the necessary exponent adjustment. + if a_abs < implicit_bit { + let (exponent, significand) = F::normalize(a_significand); + scale += exponent; + a_significand = significand; + } + + if b_abs < implicit_bit { + let (exponent, significand) = F::normalize(b_significand); + scale -= exponent; + b_significand = significand; + } + } + + // Or in the implicit significand bit. (If we fell through from the + // denormal path it was already set by normalize( ), but setting it twice + // won't hurt anything.) + a_significand |= implicit_bit; + b_significand |= implicit_bit; + let mut quotient_exponent: i32 = CastInto::::cast(a_exponent) + .wrapping_sub(CastInto::::cast(b_exponent)) + .wrapping_add(scale); + + // Align the significand of b as a Q31 fixed-point number in the range + // [1, 2.0) and get a Q32 approximate reciprocal using a small minimax + // polynomial approximation: reciprocal = 3/4 + 1/sqrt(2) - b/2. This + // is accurate to about 3.5 binary digits. + let q31b = CastInto::::cast(b_significand >> 21.cast()); + let mut recip32 = (0x7504f333u32).wrapping_sub(q31b); + + // Now refine the reciprocal estimate using a Newton-Raphson iteration: + // + // x1 = x0 * (2 - x0 * b) + // + // This doubles the number of correct binary digits in the approximation + // with each iteration, so after three iterations, we have about 28 binary + // digits of accuracy. + + let mut correction32: u32 = + negate_u32(((recip32 as u64).wrapping_mul(q31b as u64) >> 32) as u32); + recip32 = ((recip32 as u64).wrapping_mul(correction32 as u64) >> 31) as u32; + correction32 = negate_u32(((recip32 as u64).wrapping_mul(q31b as u64) >> 32) as u32); + recip32 = ((recip32 as u64).wrapping_mul(correction32 as u64) >> 31) as u32; + correction32 = negate_u32(((recip32 as u64).wrapping_mul(q31b as u64) >> 32) as u32); + recip32 = ((recip32 as u64).wrapping_mul(correction32 as u64) >> 31) as u32; + + // recip32 might have overflowed to exactly zero in the preceeding + // computation if the high word of b is exactly 1.0. This would sabotage + // the full-width final stage of the computation that follows, so we adjust + // recip32 downward by one bit. + recip32 = recip32.wrapping_sub(1); + + // We need to perform one more iteration to get us to 56 binary digits; + // The last iteration needs to happen with extra precision. 
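// A minimal sketch of the Newton-Raphson refinement described above, written in plain
// f64 arithmetic rather than Q31/Q32 fixed point: starting from the same minimax guess
// 3/4 + 1/sqrt(2) - b/2 for b in [1.0, 2.0), each step x <- x * (2 - b*x) roughly
// doubles the number of correct bits of the reciprocal.
fn nr_reciprocal(b: f64) -> f64 {
    let mut x = 0.75 + core::f64::consts::FRAC_1_SQRT_2 - b / 2.0;
    for _ in 0..3 {
        x = x * (2.0 - b * x); // Newton-Raphson step for 1/b
    }
    x
}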
+ let q63blo = CastInto::::cast(b_significand << 11.cast()); + + let correction: u64 = negate_u64( + (recip32 as u64) + .wrapping_mul(q31b as u64) + .wrapping_add((recip32 as u64).wrapping_mul(q63blo as u64) >> 32), + ); + let c_hi = (correction >> 32) as u32; + let c_lo = correction as u32; + let mut reciprocal: u64 = (recip32 as u64) + .wrapping_mul(c_hi as u64) + .wrapping_add((recip32 as u64).wrapping_mul(c_lo as u64) >> 32); + + // We already adjusted the 32-bit estimate, now we need to adjust the final + // 64-bit reciprocal estimate downward to ensure that it is strictly smaller + // than the infinitely precise exact reciprocal. Because the computation + // of the Newton-Raphson step is truncating at every step, this adjustment + // is small; most of the work is already done. + reciprocal = reciprocal.wrapping_sub(2); + + // The numerical reciprocal is accurate to within 2^-56, lies in the + // interval [0.5, 1.0), and is strictly smaller than the true reciprocal + // of b. Multiplying a by this reciprocal thus gives a numerical q = a/b + // in Q53 with the following properties: + // + // 1. q < a/b + // 2. q is in the interval [0.5, 2.0) + // 3. the error in q is bounded away from 2^-53 (actually, we have a + // couple of bits to spare, but this is all we need). + + // We need a 64 x 64 multiply high to compute q, which isn't a basic + // operation in C, so we need to be a little bit fussy. + // let mut quotient: F::Int = ((((reciprocal as u64) + // .wrapping_mul(CastInto::::cast(a_significand << 1) as u64)) + // >> 32) as u32) + // .cast(); + + // We need a 64 x 64 multiply high to compute q, which isn't a basic + // operation in C, so we need to be a little bit fussy. + let mut quotient = (a_significand << 2).widen_mul(reciprocal.cast()).hi(); + + // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0). + // In either case, we are going to compute a residual of the form + // + // r = a - q*b + // + // We know from the construction of q that r satisfies: + // + // 0 <= r < ulp(q)*b + // + // if r is greater than 1/2 ulp(q)*b, then q rounds up. Otherwise, we + // already have the correct result. The exact halfway case cannot occur. + // We also take this time to right shift quotient if it falls in the [1,2) + // range and adjust the exponent accordingly. + let residual = if quotient < (implicit_bit << 1) { + quotient_exponent = quotient_exponent.wrapping_sub(1); + (a_significand << (significand_bits + 1)).wrapping_sub(quotient.wrapping_mul(b_significand)) + } else { + quotient >>= 1; + (a_significand << significand_bits).wrapping_sub(quotient.wrapping_mul(b_significand)) + }; + + let written_exponent = quotient_exponent.wrapping_add(exponent_bias as i32); + + if written_exponent >= max_exponent as i32 { + // If we have overflowed the exponent, return infinity. + return F::from_repr(inf_rep | quotient_sign); + } else if written_exponent < 1 { + // Flush denormals to zero. In the future, it would be nice to add + // code to round them correctly. + return F::from_repr(quotient_sign); + } else { + let round = ((residual << 1) > b_significand) as u32; + // Clear the implicit bits + let mut abs_result = quotient & significand_mask; + // Insert the exponent + abs_result |= written_exponent.cast() << significand_bits; + // Round + abs_result = abs_result.wrapping_add(round.cast()); + // Insert the sign and return + return F::from_repr(abs_result | quotient_sign); + } +} + +intrinsics! 
{ + #[arm_aeabi_alias = __aeabi_fdiv] + pub extern "C" fn __divsf3(a: f32, b: f32) -> f32 { + div32(a, b) + } + + #[arm_aeabi_alias = __aeabi_ddiv] + pub extern "C" fn __divdf3(a: f64, b: f64) -> f64 { + div64(a, b) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 { + a / b + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __divdf3vfp(a: f64, b: f64) -> f64 { + a / b + } +} diff --git a/vendor/compiler_builtins/src/float/extend.rs b/vendor/compiler_builtins/src/float/extend.rs new file mode 100644 index 000000000..39633773b --- /dev/null +++ b/vendor/compiler_builtins/src/float/extend.rs @@ -0,0 +1,83 @@ +use float::Float; +use int::{CastInto, Int}; + +/// Generic conversion from a narrower to a wider IEEE-754 floating-point type +fn extend(a: F) -> R +where + F::Int: CastInto, + u64: CastInto, + u32: CastInto, + R::Int: CastInto, + R::Int: CastInto, + u64: CastInto, + F::Int: CastInto, +{ + let src_zero = F::Int::ZERO; + let src_one = F::Int::ONE; + let src_bits = F::BITS; + let src_sign_bits = F::SIGNIFICAND_BITS; + let src_exp_bias = F::EXPONENT_BIAS; + let src_min_normal = F::IMPLICIT_BIT; + let src_infinity = F::EXPONENT_MASK; + let src_sign_mask = F::SIGN_MASK as F::Int; + let src_abs_mask = src_sign_mask - src_one; + let src_qnan = F::SIGNIFICAND_MASK; + let src_nan_code = src_qnan - src_one; + + let dst_bits = R::BITS; + let dst_sign_bits = R::SIGNIFICAND_BITS; + let dst_inf_exp = R::EXPONENT_MAX; + let dst_exp_bias = R::EXPONENT_BIAS; + let dst_min_normal = R::IMPLICIT_BIT; + + let sign_bits_delta = dst_sign_bits - src_sign_bits; + let exp_bias_delta = dst_exp_bias - src_exp_bias; + let a_abs = a.repr() & src_abs_mask; + let mut abs_result = R::Int::ZERO; + + if a_abs.wrapping_sub(src_min_normal) < src_infinity.wrapping_sub(src_min_normal) { + // a is a normal number. + // Extend to the destination type by shifting the significand and + // exponent into the proper position and rebiasing the exponent. + let abs_dst: R::Int = a_abs.cast(); + let bias_dst: R::Int = exp_bias_delta.cast(); + abs_result = abs_dst.wrapping_shl(sign_bits_delta); + abs_result += bias_dst.wrapping_shl(dst_sign_bits); + } else if a_abs >= src_infinity { + // a is NaN or infinity. + // Conjure the result by beginning with infinity, then setting the qNaN + // bit (if needed) and right-aligning the rest of the trailing NaN + // payload field. + let qnan_dst: R::Int = (a_abs & src_qnan).cast(); + let nan_code_dst: R::Int = (a_abs & src_nan_code).cast(); + let inf_exp_dst: R::Int = dst_inf_exp.cast(); + abs_result = inf_exp_dst.wrapping_shl(dst_sign_bits); + abs_result |= qnan_dst.wrapping_shl(sign_bits_delta); + abs_result |= nan_code_dst.wrapping_shl(sign_bits_delta); + } else if a_abs != src_zero { + // a is denormal. + // Renormalize the significand and clear the leading bit, then insert + // the correct adjusted exponent in the destination type. + let scale = a_abs.leading_zeros() - src_min_normal.leading_zeros(); + let abs_dst: R::Int = a_abs.cast(); + let bias_dst: R::Int = (exp_bias_delta - scale + 1).cast(); + abs_result = abs_dst.wrapping_shl(sign_bits_delta + scale); + abs_result = (abs_result ^ dst_min_normal) | (bias_dst.wrapping_shl(dst_sign_bits)); + } + + let sign_result: R::Int = (a.repr() & src_sign_mask).cast(); + R::from_repr(abs_result | (sign_result.wrapping_shl(dst_bits - src_bits))) +} + +intrinsics! 
{ + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_f2d] + pub extern "C" fn __extendsfdf2(a: f32) -> f64 { + extend(a) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __extendsfdf2vfp(a: f32) -> f64 { + a as f64 // LLVM generate 'fcvtds' + } +} diff --git a/vendor/compiler_builtins/src/float/mod.rs b/vendor/compiler_builtins/src/float/mod.rs new file mode 100644 index 000000000..01a5504d5 --- /dev/null +++ b/vendor/compiler_builtins/src/float/mod.rs @@ -0,0 +1,175 @@ +use core::ops; + +use super::int::Int; + +pub mod add; +pub mod cmp; +pub mod conv; +pub mod div; +pub mod extend; +pub mod mul; +pub mod pow; +pub mod sub; +pub mod trunc; + +public_test_dep! { +/// Trait for some basic operations on floats +pub(crate) trait Float: + Copy + + core::fmt::Debug + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::MulAssign + + ops::Add + + ops::Sub + + ops::Div + + ops::Rem +{ + /// A uint of the same with as the float + type Int: Int; + + /// A int of the same with as the float + type SignedInt: Int; + + /// An int capable of containing the exponent bits plus a sign bit. This is signed. + type ExpInt: Int; + + const ZERO: Self; + const ONE: Self; + + /// The bitwidth of the float type + const BITS: u32; + + /// The bitwidth of the significand + const SIGNIFICAND_BITS: u32; + + /// The bitwidth of the exponent + const EXPONENT_BITS: u32 = Self::BITS - Self::SIGNIFICAND_BITS - 1; + + /// The maximum value of the exponent + const EXPONENT_MAX: u32 = (1 << Self::EXPONENT_BITS) - 1; + + /// The exponent bias value + const EXPONENT_BIAS: u32 = Self::EXPONENT_MAX >> 1; + + /// A mask for the sign bit + const SIGN_MASK: Self::Int; + + /// A mask for the significand + const SIGNIFICAND_MASK: Self::Int; + + // The implicit bit of the float format + const IMPLICIT_BIT: Self::Int; + + /// A mask for the exponent + const EXPONENT_MASK: Self::Int; + + /// Returns `self` transmuted to `Self::Int` + fn repr(self) -> Self::Int; + + /// Returns `self` transmuted to `Self::SignedInt` + fn signed_repr(self) -> Self::SignedInt; + + /// Checks if two floats have the same bit representation. *Except* for NaNs! NaN can be + /// represented in multiple different ways. This method returns `true` if two NaNs are + /// compared. + fn eq_repr(self, rhs: Self) -> bool; + + /// Returns the sign bit + fn sign(self) -> bool; + + /// Returns the exponent with bias + fn exp(self) -> Self::ExpInt; + + /// Returns the significand with no implicit bit (or the "fractional" part) + fn frac(self) -> Self::Int; + + /// Returns the significand with implicit bit + fn imp_frac(self) -> Self::Int; + + /// Returns a `Self::Int` transmuted back to `Self` + fn from_repr(a: Self::Int) -> Self; + + /// Constructs a `Self` from its parts. Inputs are treated as bits and shifted into position. + fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self; + + /// Returns (normalized exponent, normalized significand) + fn normalize(significand: Self::Int) -> (i32, Self::Int); + + /// Returns if `self` is subnormal + fn is_subnormal(self) -> bool; +} +} + +macro_rules! 
float_impl { + ($ty:ident, $ity:ident, $sity:ident, $expty:ident, $bits:expr, $significand_bits:expr) => { + impl Float for $ty { + type Int = $ity; + type SignedInt = $sity; + type ExpInt = $expty; + + const ZERO: Self = 0.0; + const ONE: Self = 1.0; + + const BITS: u32 = $bits; + const SIGNIFICAND_BITS: u32 = $significand_bits; + + const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1); + const SIGNIFICAND_MASK: Self::Int = (1 << Self::SIGNIFICAND_BITS) - 1; + const IMPLICIT_BIT: Self::Int = 1 << Self::SIGNIFICAND_BITS; + const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK); + + fn repr(self) -> Self::Int { + self.to_bits() + } + fn signed_repr(self) -> Self::SignedInt { + self.to_bits() as Self::SignedInt + } + fn eq_repr(self, rhs: Self) -> bool { + if self.is_nan() && rhs.is_nan() { + true + } else { + self.repr() == rhs.repr() + } + } + fn sign(self) -> bool { + self.signed_repr() < Self::SignedInt::ZERO + } + fn exp(self) -> Self::ExpInt { + ((self.to_bits() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS) as Self::ExpInt + } + fn frac(self) -> Self::Int { + self.to_bits() & Self::SIGNIFICAND_MASK + } + fn imp_frac(self) -> Self::Int { + self.frac() | Self::IMPLICIT_BIT + } + fn from_repr(a: Self::Int) -> Self { + Self::from_bits(a) + } + fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self { + Self::from_repr( + ((sign as Self::Int) << (Self::BITS - 1)) + | ((exponent << Self::SIGNIFICAND_BITS) & Self::EXPONENT_MASK) + | (significand & Self::SIGNIFICAND_MASK), + ) + } + fn normalize(significand: Self::Int) -> (i32, Self::Int) { + let shift = significand + .leading_zeros() + .wrapping_sub((Self::Int::ONE << Self::SIGNIFICAND_BITS).leading_zeros()); + ( + 1i32.wrapping_sub(shift as i32), + significand << shift as Self::Int, + ) + } + fn is_subnormal(self) -> bool { + (self.repr() & Self::EXPONENT_MASK) == Self::Int::ZERO + } + } + }; +} + +float_impl!(f32, u32, i32, i16, 32, 23); +float_impl!(f64, u64, i64, i16, 64, 52); diff --git a/vendor/compiler_builtins/src/float/mul.rs b/vendor/compiler_builtins/src/float/mul.rs new file mode 100644 index 000000000..c89f22756 --- /dev/null +++ b/vendor/compiler_builtins/src/float/mul.rs @@ -0,0 +1,209 @@ +use float::Float; +use int::{CastInto, DInt, HInt, Int}; + +fn mul(a: F, b: F) -> F +where + u32: CastInto, + F::Int: CastInto, + i32: CastInto, + F::Int: CastInto, + F::Int: HInt, +{ + let one = F::Int::ONE; + let zero = F::Int::ZERO; + + let bits = F::BITS; + let significand_bits = F::SIGNIFICAND_BITS; + let max_exponent = F::EXPONENT_MAX; + + let exponent_bias = F::EXPONENT_BIAS; + + let implicit_bit = F::IMPLICIT_BIT; + let significand_mask = F::SIGNIFICAND_MASK; + let sign_bit = F::SIGN_MASK as F::Int; + let abs_mask = sign_bit - one; + let exponent_mask = F::EXPONENT_MASK; + let inf_rep = exponent_mask; + let quiet_bit = implicit_bit >> 1; + let qnan_rep = exponent_mask | quiet_bit; + let exponent_bits = F::EXPONENT_BITS; + + let a_rep = a.repr(); + let b_rep = b.repr(); + + let a_exponent = (a_rep >> significand_bits) & max_exponent.cast(); + let b_exponent = (b_rep >> significand_bits) & max_exponent.cast(); + let product_sign = (a_rep ^ b_rep) & sign_bit; + + let mut a_significand = a_rep & significand_mask; + let mut b_significand = b_rep & significand_mask; + let mut scale = 0; + + // Detect if a or b is zero, denormal, infinity, or NaN. 
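+    // A single unsigned comparison catches all of these cases: `wrapping_sub(one)`
+    // maps a biased exponent of zero (zero or denormal) to the largest value of
+    // the type and maps `max_exponent` (infinity or NaN) to `max_exponent - 1`,
+    // so only exponents in [1, max_exponent - 1], i.e. non-zero finite normal
+    // values, fail the test below.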
+ if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() + { + let a_abs = a_rep & abs_mask; + let b_abs = b_rep & abs_mask; + + // NaN + anything = qNaN + if a_abs > inf_rep { + return F::from_repr(a_rep | quiet_bit); + } + // anything + NaN = qNaN + if b_abs > inf_rep { + return F::from_repr(b_rep | quiet_bit); + } + + if a_abs == inf_rep { + if b_abs != zero { + // infinity * non-zero = +/- infinity + return F::from_repr(a_abs | product_sign); + } else { + // infinity * zero = NaN + return F::from_repr(qnan_rep); + } + } + + if b_abs == inf_rep { + if a_abs != zero { + // infinity * non-zero = +/- infinity + return F::from_repr(b_abs | product_sign); + } else { + // infinity * zero = NaN + return F::from_repr(qnan_rep); + } + } + + // zero * anything = +/- zero + if a_abs == zero { + return F::from_repr(product_sign); + } + + // anything * zero = +/- zero + if b_abs == zero { + return F::from_repr(product_sign); + } + + // one or both of a or b is denormal, the other (if applicable) is a + // normal number. Renormalize one or both of a and b, and set scale to + // include the necessary exponent adjustment. + if a_abs < implicit_bit { + let (exponent, significand) = F::normalize(a_significand); + scale += exponent; + a_significand = significand; + } + + if b_abs < implicit_bit { + let (exponent, significand) = F::normalize(b_significand); + scale += exponent; + b_significand = significand; + } + } + + // Or in the implicit significand bit. (If we fell through from the + // denormal path it was already set by normalize( ), but setting it twice + // won't hurt anything.) + a_significand |= implicit_bit; + b_significand |= implicit_bit; + + // Get the significand of a*b. Before multiplying the significands, shift + // one of them left to left-align it in the field. Thus, the product will + // have (exponentBits + 2) integral digits, all but two of which must be + // zero. Normalizing this result is just a conditional left-shift by one + // and bumping the exponent accordingly. + let (mut product_low, mut product_high) = a_significand + .widen_mul(b_significand << exponent_bits) + .lo_hi(); + + let a_exponent_i32: i32 = a_exponent.cast(); + let b_exponent_i32: i32 = b_exponent.cast(); + let mut product_exponent: i32 = a_exponent_i32 + .wrapping_add(b_exponent_i32) + .wrapping_add(scale) + .wrapping_sub(exponent_bias as i32); + + // Normalize the significand, adjust exponent if needed. + if (product_high & implicit_bit) != zero { + product_exponent = product_exponent.wrapping_add(1); + } else { + product_high = (product_high << 1) | (product_low >> (bits - 1)); + product_low <<= 1; + } + + // If we have overflowed the type, return +/- infinity. + if product_exponent >= max_exponent as i32 { + return F::from_repr(inf_rep | product_sign); + } + + if product_exponent <= 0 { + // Result is denormal before rounding + // + // If the result is so small that it just underflows to zero, return + // a zero of the appropriate sign. Mathematically there is no need to + // handle this case separately, but we make it a special case to + // simplify the shift logic. + let shift = one.wrapping_sub(product_exponent.cast()).cast(); + if shift >= bits { + return F::from_repr(product_sign); + } + + // Otherwise, shift the significand of the result so that the round + // bit is the high bit of productLo. 
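+        // The low bits that are about to be shifted out are folded back into
+        // `product_low` as "sticky" bits, so the round-to-nearest, ties-to-even
+        // logic at the end can still see whether any non-zero bits were
+        // discarded during the denormalizing shift.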
+ if shift < bits { + let sticky = product_low << (bits - shift); + product_low = product_high << (bits - shift) | product_low >> shift | sticky; + product_high >>= shift; + } else if shift < (2 * bits) { + let sticky = product_high << (2 * bits - shift) | product_low; + product_low = product_high >> (shift - bits) | sticky; + product_high = zero; + } else { + product_high = zero; + } + } else { + // Result is normal before rounding; insert the exponent. + product_high &= significand_mask; + product_high |= product_exponent.cast() << significand_bits; + } + + // Insert the sign of the result: + product_high |= product_sign; + + // Final rounding. The final result may overflow to infinity, or underflow + // to zero, but those are the correct results in those cases. We use the + // default IEEE-754 round-to-nearest, ties-to-even rounding mode. + if product_low > sign_bit { + product_high += one; + } + + if product_low == sign_bit { + product_high += product_high & one; + } + + F::from_repr(product_high) +} + +intrinsics! { + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_fmul] + pub extern "C" fn __mulsf3(a: f32, b: f32) -> f32 { + mul(a, b) + } + + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_dmul] + pub extern "C" fn __muldf3(a: f64, b: f64) -> f64 { + mul(a, b) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __mulsf3vfp(a: f32, b: f32) -> f32 { + a * b + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __muldf3vfp(a: f64, b: f64) -> f64 { + a * b + } +} diff --git a/vendor/compiler_builtins/src/float/pow.rs b/vendor/compiler_builtins/src/float/pow.rs new file mode 100644 index 000000000..a75340c30 --- /dev/null +++ b/vendor/compiler_builtins/src/float/pow.rs @@ -0,0 +1,36 @@ +use float::Float; +use int::Int; + +/// Returns `a` raised to the power `b` +fn pow(a: F, b: i32) -> F { + let mut a = a; + let recip = b < 0; + let mut pow = Int::abs_diff(b, 0); + let mut mul = F::ONE; + loop { + if (pow & 1) != 0 { + mul *= a; + } + pow >>= 1; + if pow == 0 { + break; + } + a *= a; + } + + if recip { + F::ONE / mul + } else { + mul + } +} + +intrinsics! { + pub extern "C" fn __powisf2(a: f32, b: i32) -> f32 { + pow(a, b) + } + + pub extern "C" fn __powidf2(a: f64, b: i32) -> f64 { + pow(a, b) + } +} diff --git a/vendor/compiler_builtins/src/float/sub.rs b/vendor/compiler_builtins/src/float/sub.rs new file mode 100644 index 000000000..8d300e9d2 --- /dev/null +++ b/vendor/compiler_builtins/src/float/sub.rs @@ -0,0 +1,25 @@ +use float::add::__adddf3; +use float::add::__addsf3; +use float::Float; + +intrinsics! 
{ + #[arm_aeabi_alias = __aeabi_fsub] + pub extern "C" fn __subsf3(a: f32, b: f32) -> f32 { + __addsf3(a, f32::from_repr(b.repr() ^ f32::SIGN_MASK)) + } + + #[arm_aeabi_alias = __aeabi_dsub] + pub extern "C" fn __subdf3(a: f64, b: f64) -> f64 { + __adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK)) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __subsf3vfp(a: f32, b: f32) -> f32 { + a - b + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __subdf3vfp(a: f64, b: f64) -> f64 { + a - b + } +} diff --git a/vendor/compiler_builtins/src/float/trunc.rs b/vendor/compiler_builtins/src/float/trunc.rs new file mode 100644 index 000000000..d73713084 --- /dev/null +++ b/vendor/compiler_builtins/src/float/trunc.rs @@ -0,0 +1,125 @@ +use float::Float; +use int::{CastInto, Int}; + +fn trunc(a: F) -> R +where + F::Int: CastInto, + F::Int: CastInto, + u64: CastInto, + u32: CastInto, + + R::Int: CastInto, + u32: CastInto, + F::Int: CastInto, +{ + let src_zero = F::Int::ZERO; + let src_one = F::Int::ONE; + let src_bits = F::BITS; + let src_exp_bias = F::EXPONENT_BIAS; + + let src_min_normal = F::IMPLICIT_BIT; + let src_significand_mask = F::SIGNIFICAND_MASK; + let src_infinity = F::EXPONENT_MASK; + let src_sign_mask = F::SIGN_MASK; + let src_abs_mask = src_sign_mask - src_one; + let round_mask = (src_one << (F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS)) - src_one; + let halfway = src_one << (F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS - 1); + let src_qnan = src_one << (F::SIGNIFICAND_BITS - 1); + let src_nan_code = src_qnan - src_one; + + let dst_zero = R::Int::ZERO; + let dst_one = R::Int::ONE; + let dst_bits = R::BITS; + let dst_inf_exp = R::EXPONENT_MAX; + let dst_exp_bias = R::EXPONENT_BIAS; + + let underflow_exponent: F::Int = (src_exp_bias + 1 - dst_exp_bias).cast(); + let overflow_exponent: F::Int = (src_exp_bias + dst_inf_exp - dst_exp_bias).cast(); + let underflow: F::Int = underflow_exponent << F::SIGNIFICAND_BITS; + let overflow: F::Int = overflow_exponent << F::SIGNIFICAND_BITS; + + let dst_qnan = R::Int::ONE << (R::SIGNIFICAND_BITS - 1); + let dst_nan_code = dst_qnan - dst_one; + + let sign_bits_delta = F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS; + // Break a into a sign and representation of the absolute value. + let a_abs = a.repr() & src_abs_mask; + let sign = a.repr() & src_sign_mask; + let mut abs_result: R::Int; + + if a_abs.wrapping_sub(underflow) < a_abs.wrapping_sub(overflow) { + // The exponent of a is within the range of normal numbers in the + // destination format. We can convert by simply right-shifting with + // rounding and adjusting the exponent. + abs_result = (a_abs >> sign_bits_delta).cast(); + let tmp = src_exp_bias.wrapping_sub(dst_exp_bias) << R::SIGNIFICAND_BITS; + abs_result = abs_result.wrapping_sub(tmp.cast()); + + let round_bits = a_abs & round_mask; + if round_bits > halfway { + // Round to nearest. + abs_result += dst_one; + } else if round_bits == halfway { + // Tie to even. + abs_result += abs_result & dst_one; + }; + } else if a_abs > src_infinity { + // a is NaN. + // Conjure the result by beginning with infinity, setting the qNaN + // bit and inserting the (truncated) trailing NaN field. + abs_result = (dst_inf_exp << R::SIGNIFICAND_BITS).cast(); + abs_result |= dst_qnan; + abs_result |= dst_nan_code + & ((a_abs & src_nan_code) >> (F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS)).cast(); + } else if a_abs >= overflow { + // a overflows to infinity. 
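+        // The result is the canonical infinity encoding: all-ones exponent with
+        // a zero significand; the sign is applied at the end.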
+ abs_result = (dst_inf_exp << R::SIGNIFICAND_BITS).cast(); + } else { + // a underflows on conversion to the destination type or is an exact + // zero. The result may be a denormal or zero. Extract the exponent + // to get the shift amount for the denormalization. + let a_exp: u32 = (a_abs >> F::SIGNIFICAND_BITS).cast(); + let shift = src_exp_bias - dst_exp_bias - a_exp + 1; + + let significand = (a.repr() & src_significand_mask) | src_min_normal; + + // Right shift by the denormalization amount with sticky. + if shift > F::SIGNIFICAND_BITS { + abs_result = dst_zero; + } else { + let sticky = if (significand << (src_bits - shift)) != src_zero { + src_one + } else { + src_zero + }; + let denormalized_significand: F::Int = significand >> shift | sticky; + abs_result = + (denormalized_significand >> (F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS)).cast(); + let round_bits = denormalized_significand & round_mask; + // Round to nearest + if round_bits > halfway { + abs_result += dst_one; + } + // Ties to even + else if round_bits == halfway { + abs_result += abs_result & dst_one; + }; + } + } + + // Apply the signbit to the absolute value. + R::from_repr(abs_result | sign.wrapping_shr(src_bits - dst_bits).cast()) +} + +intrinsics! { + #[aapcs_on_arm] + #[arm_aeabi_alias = __aeabi_d2f] + pub extern "C" fn __truncdfsf2(a: f64) -> f32 { + trunc(a) + } + + #[cfg(target_arch = "arm")] + pub extern "C" fn __truncdfsf2vfp(a: f64) -> f32 { + a as f32 + } +} diff --git a/vendor/compiler_builtins/src/int/addsub.rs b/vendor/compiler_builtins/src/int/addsub.rs new file mode 100644 index 000000000..f4841e90f --- /dev/null +++ b/vendor/compiler_builtins/src/int/addsub.rs @@ -0,0 +1,96 @@ +use int::{DInt, Int}; + +trait UAddSub: DInt { + fn uadd(self, other: Self) -> Self { + let (lo, carry) = self.lo().overflowing_add(other.lo()); + let hi = self.hi().wrapping_add(other.hi()); + let carry = if carry { Self::H::ONE } else { Self::H::ZERO }; + Self::from_lo_hi(lo, hi.wrapping_add(carry)) + } + fn uadd_one(self) -> Self { + let (lo, carry) = self.lo().overflowing_add(Self::H::ONE); + let carry = if carry { Self::H::ONE } else { Self::H::ZERO }; + Self::from_lo_hi(lo, self.hi().wrapping_add(carry)) + } + fn usub(self, other: Self) -> Self { + let uneg = (!other).uadd_one(); + self.uadd(uneg) + } +} + +impl UAddSub for u128 {} + +trait AddSub: Int +where + ::UnsignedInt: UAddSub, +{ + fn add(self, other: Self) -> Self { + Self::from_unsigned(self.unsigned().uadd(other.unsigned())) + } + fn sub(self, other: Self) -> Self { + Self::from_unsigned(self.unsigned().usub(other.unsigned())) + } +} + +impl AddSub for u128 {} +impl AddSub for i128 {} + +trait Addo: AddSub +where + ::UnsignedInt: UAddSub, +{ + fn addo(self, other: Self) -> (Self, bool) { + let sum = AddSub::add(self, other); + (sum, (other < Self::ZERO) != (sum < self)) + } +} + +impl Addo for i128 {} +impl Addo for u128 {} + +trait Subo: AddSub +where + ::UnsignedInt: UAddSub, +{ + fn subo(self, other: Self) -> (Self, bool) { + let sum = AddSub::sub(self, other); + (sum, (other < Self::ZERO) != (self < sum)) + } +} + +impl Subo for i128 {} +impl Subo for u128 {} + +intrinsics! 
{ + pub extern "C" fn __rust_i128_add(a: i128, b: i128) -> i128 { + AddSub::add(a,b) + } + + pub extern "C" fn __rust_i128_addo(a: i128, b: i128) -> (i128, bool) { + a.addo(b) + } + + pub extern "C" fn __rust_u128_add(a: u128, b: u128) -> u128 { + AddSub::add(a,b) + } + + pub extern "C" fn __rust_u128_addo(a: u128, b: u128) -> (u128, bool) { + a.addo(b) + } + + pub extern "C" fn __rust_i128_sub(a: i128, b: i128) -> i128 { + AddSub::sub(a,b) + } + + pub extern "C" fn __rust_i128_subo(a: i128, b: i128) -> (i128, bool) { + a.subo(b) + } + + pub extern "C" fn __rust_u128_sub(a: u128, b: u128) -> u128 { + AddSub::sub(a,b) + } + + pub extern "C" fn __rust_u128_subo(a: u128, b: u128) -> (u128, bool) { + a.subo(b) + } +} diff --git a/vendor/compiler_builtins/src/int/leading_zeros.rs b/vendor/compiler_builtins/src/int/leading_zeros.rs new file mode 100644 index 000000000..9e60ab0d7 --- /dev/null +++ b/vendor/compiler_builtins/src/int/leading_zeros.rs @@ -0,0 +1,149 @@ +// Note: these functions happen to produce the correct `usize::leading_zeros(0)` value +// without a explicit zero check. Zero is probably common enough that it could warrant +// adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`. +// Compilers will insert the check for zero in cases where it is needed. + +public_test_dep! { +/// Returns the number of leading binary zeros in `x`. +#[allow(dead_code)] +pub(crate) fn usize_leading_zeros_default(x: usize) -> usize { + // The basic idea is to test if the higher bits of `x` are zero and bisect the number + // of leading zeros. It is possible for all branches of the bisection to use the same + // code path by conditionally shifting the higher parts down to let the next bisection + // step work on the higher or lower parts of `x`. Instead of starting with `z == 0` + // and adding to the number of zeros, it is slightly faster to start with + // `z == usize::MAX.count_ones()` and subtract from the potential number of zeros, + // because it simplifies the final bisection step. + let mut x = x; + // the number of potential leading zeros + let mut z = usize::MAX.count_ones() as usize; + // a temporary + let mut t: usize; + #[cfg(target_pointer_width = "64")] + { + t = x >> 32; + if t != 0 { + z -= 32; + x = t; + } + } + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + { + t = x >> 16; + if t != 0 { + z -= 16; + x = t; + } + } + t = x >> 8; + if t != 0 { + z -= 8; + x = t; + } + t = x >> 4; + if t != 0 { + z -= 4; + x = t; + } + t = x >> 2; + if t != 0 { + z -= 2; + x = t; + } + // the last two bisections are combined into one conditional + t = x >> 1; + if t != 0 { + z - 2 + } else { + z - x + } + + // We could potentially save a few cycles by using the LUT trick from + // "https://embeddedgurus.com/state-space/2014/09/ + // fast-deterministic-and-portable-counting-leading-zeros/". + // However, 256 bytes for a LUT is too large for embedded use cases. We could remove + // the last 3 bisections and use this 16 byte LUT for the rest of the work: + //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]; + //z -= LUT[x] as usize; + //z + // However, it ends up generating about the same number of instructions. When benchmarked + // on x86_64, it is slightly faster to use the LUT, but this is probably because of OOO + // execution effects. Changing to using a LUT and branching is risky for smaller cores. 
+} +} + +// The above method does not compile well on RISC-V (because of the lack of predicated +// instructions), producing code with many branches or using an excessively long +// branchless solution. This method takes advantage of the set-if-less-than instruction on +// RISC-V that allows `(x >= power-of-two) as usize` to be branchless. + +public_test_dep! { +/// Returns the number of leading binary zeros in `x`. +#[allow(dead_code)] +pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize { + let mut x = x; + // the number of potential leading zeros + let mut z = usize::MAX.count_ones() as usize; + // a temporary + let mut t: usize; + + // RISC-V does not have a set-if-greater-than-or-equal instruction and + // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is + // still the most optimal method. A conditional set can only be turned into a single + // immediate instruction if `x` is compared with an immediate `imm` (that can fit into + // 12 bits) like `x < imm` but not `imm < x` (because the immediate is always on the + // right). If we try to save an instruction by using `x < imm` for each bisection, we + // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`, + // but the immediate will never fit into 12 bits and never save an instruction. + #[cfg(target_pointer_width = "64")] + { + // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise + // `t` is set to 0. + t = ((x >= (1 << 32)) as usize) << 5; + // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the + // next step to process. + x >>= t; + // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential + // leading zeros + z -= t; + } + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + { + t = ((x >= (1 << 16)) as usize) << 4; + x >>= t; + z -= t; + } + t = ((x >= (1 << 8)) as usize) << 3; + x >>= t; + z -= t; + t = ((x >= (1 << 4)) as usize) << 2; + x >>= t; + z -= t; + t = ((x >= (1 << 2)) as usize) << 1; + x >>= t; + z -= t; + t = (x >= (1 << 1)) as usize; + x >>= t; + z -= t; + // All bits except the LSB are guaranteed to be zero for this final bisection step. + // If `x != 0` then `x == 1` and subtracts one potential zero from `z`. + z - x +} +} + +intrinsics! { + #[maybe_use_optimized_c_shim] + #[cfg(any( + target_pointer_width = "16", + target_pointer_width = "32", + target_pointer_width = "64" + ))] + /// Returns the number of leading binary zeros in `x`. + pub extern "C" fn __clzsi2(x: usize) -> usize { + if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { + usize_leading_zeros_riscv(x) + } else { + usize_leading_zeros_default(x) + } + } +} diff --git a/vendor/compiler_builtins/src/int/mod.rs b/vendor/compiler_builtins/src/int/mod.rs new file mode 100644 index 000000000..509f9fdae --- /dev/null +++ b/vendor/compiler_builtins/src/int/mod.rs @@ -0,0 +1,390 @@ +use core::ops; + +mod specialized_div_rem; + +pub mod addsub; +pub mod leading_zeros; +pub mod mul; +pub mod sdiv; +pub mod shift; +pub mod udiv; + +pub use self::leading_zeros::__clzsi2; + +public_test_dep! 
{ +/// Trait for some basic operations on integers +pub(crate) trait Int: + Copy + + core::fmt::Debug + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Div + + ops::Shl + + ops::Shr + + ops::BitOr + + ops::BitXor + + ops::BitAnd + + ops::Not +{ + /// Type with the same width but other signedness + type OtherSign: Int; + /// Unsigned version of Self + type UnsignedInt: Int; + + /// If `Self` is a signed integer + const SIGNED: bool; + + /// The bitwidth of the int type + const BITS: u32; + + const ZERO: Self; + const ONE: Self; + const MIN: Self; + const MAX: Self; + + /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing + /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, + /// 112,119,120,125,126,127]. + const FUZZ_LENGTHS: [u8; 20]; + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. + const FUZZ_NUM: usize; + + fn unsigned(self) -> Self::UnsignedInt; + fn from_unsigned(unsigned: Self::UnsignedInt) -> Self; + + fn from_bool(b: bool) -> Self; + + /// Prevents the need for excessive conversions between signed and unsigned + fn logical_shr(self, other: u32) -> Self; + + /// Absolute difference between two integers. + fn abs_diff(self, other: Self) -> Self::UnsignedInt; + + // copied from primitive integers, but put in a trait + fn is_zero(self) -> bool; + fn wrapping_neg(self) -> Self; + fn wrapping_add(self, other: Self) -> Self; + fn wrapping_mul(self, other: Self) -> Self; + fn wrapping_sub(self, other: Self) -> Self; + fn wrapping_shl(self, other: u32) -> Self; + fn wrapping_shr(self, other: u32) -> Self; + fn rotate_left(self, other: u32) -> Self; + fn overflowing_add(self, other: Self) -> (Self, bool); + fn leading_zeros(self) -> u32; +} +} + +macro_rules! int_impl_common { + ($ty:ty) => { + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + + const FUZZ_LENGTHS: [u8; 20] = { + let bits = ::BITS; + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. + let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v + }; + + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. 
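+                // For example, u32 has log2 == 5 and gets 8 + 4 = 12 entries,
+                // while u128 has log2 == 7 and gets 8 + 12 = 20, the maximum
+                // documented for `FUZZ_NUM` above.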
+ 8 + (4 * (log2 - 4)) + } + }; + + fn from_bool(b: bool) -> Self { + b as $ty + } + + fn logical_shr(self, other: u32) -> Self { + Self::from_unsigned(self.unsigned().wrapping_shr(other)) + } + + fn is_zero(self) -> bool { + self == Self::ZERO + } + + fn wrapping_neg(self) -> Self { + ::wrapping_neg(self) + } + + fn wrapping_add(self, other: Self) -> Self { + ::wrapping_add(self, other) + } + + fn wrapping_mul(self, other: Self) -> Self { + ::wrapping_mul(self, other) + } + + fn wrapping_sub(self, other: Self) -> Self { + ::wrapping_sub(self, other) + } + + fn wrapping_shl(self, other: u32) -> Self { + ::wrapping_shl(self, other) + } + + fn wrapping_shr(self, other: u32) -> Self { + ::wrapping_shr(self, other) + } + + fn rotate_left(self, other: u32) -> Self { + ::rotate_left(self, other) + } + + fn overflowing_add(self, other: Self) -> (Self, bool) { + ::overflowing_add(self, other) + } + + fn leading_zeros(self) -> u32 { + ::leading_zeros(self) + } + }; +} + +macro_rules! int_impl { + ($ity:ty, $uty:ty) => { + impl Int for $uty { + type OtherSign = $ity; + type UnsignedInt = $uty; + + fn unsigned(self) -> $uty { + self + } + + // It makes writing macros easier if this is implemented for both signed and unsigned + #[allow(clippy::wrong_self_convention)] + fn from_unsigned(me: $uty) -> Self { + me + } + + fn abs_diff(self, other: Self) -> Self { + if self < other { + other.wrapping_sub(self) + } else { + self.wrapping_sub(other) + } + } + + int_impl_common!($uty); + } + + impl Int for $ity { + type OtherSign = $uty; + type UnsignedInt = $uty; + + fn unsigned(self) -> $uty { + self as $uty + } + + fn from_unsigned(me: $uty) -> Self { + me as $ity + } + + fn abs_diff(self, other: Self) -> $uty { + self.wrapping_sub(other).wrapping_abs() as $uty + } + + int_impl_common!($ity); + } + }; +} + +int_impl!(isize, usize); +int_impl!(i8, u8); +int_impl!(i16, u16); +int_impl!(i32, u32); +int_impl!(i64, u64); +int_impl!(i128, u128); + +public_test_dep! { +/// Trait for integers twice the bit width of another integer. This is implemented for all +/// primitives except for `u8`, because there is not a smaller primitive. +pub(crate) trait DInt: Int { + /// Integer that is half the bit width of the integer this trait is implemented for + type H: HInt + Int; + + /// Returns the low half of `self` + fn lo(self) -> Self::H; + /// Returns the high half of `self` + fn hi(self) -> Self::H; + /// Returns the low and high halves of `self` as a tuple + fn lo_hi(self) -> (Self::H, Self::H); + /// Constructs an integer using lower and higher half parts + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self; +} +} + +public_test_dep! { +/// Trait for integers half the bit width of another integer. This is implemented for all +/// primitives except for `u128`, because it there is not a larger primitive. +pub(crate) trait HInt: Int { + /// Integer that is double the bit width of the integer this trait is implemented for + type D: DInt + Int; + + /// Widens (using default extension) the integer to have double bit width + fn widen(self) -> Self::D; + /// Widens (zero extension only) the integer to have double bit width. This is needed to get + /// around problems with associated type bounds (such as `Int`) being unstable + fn zero_widen(self) -> Self::D; + /// Widens the integer to have double bit width and shifts the integer into the higher bits + fn widen_hi(self) -> Self::D; + /// Widening multiplication with zero widening. This cannot overflow. 
+ fn zero_widen_mul(self, rhs: Self) -> Self::D; + /// Widening multiplication. This cannot overflow. + fn widen_mul(self, rhs: Self) -> Self::D; +} +} + +macro_rules! impl_d_int { + ($($X:ident $D:ident),*) => { + $( + impl DInt for $D { + type H = $X; + + fn lo(self) -> Self::H { + self as $X + } + fn hi(self) -> Self::H { + (self >> <$X as Int>::BITS) as $X + } + fn lo_hi(self) -> (Self::H, Self::H) { + (self.lo(), self.hi()) + } + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { + lo.zero_widen() | hi.widen_hi() + } + } + )* + }; +} + +macro_rules! impl_h_int { + ($($H:ident $uH:ident $X:ident),*) => { + $( + impl HInt for $H { + type D = $X; + + fn widen(self) -> Self::D { + self as $X + } + fn zero_widen(self) -> Self::D { + (self as $uH) as $X + } + fn widen_hi(self) -> Self::D { + (self as $X) << <$H as Int>::BITS + } + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen().wrapping_mul(rhs.zero_widen()) + } + fn widen_mul(self, rhs: Self) -> Self::D { + self.widen().wrapping_mul(rhs.widen()) + } + } + )* + }; +} + +impl_d_int!(u8 u16, u16 u32, u32 u64, u64 u128, i8 i16, i16 i32, i32 i64, i64 i128); +impl_h_int!( + u8 u8 u16, + u16 u16 u32, + u32 u32 u64, + u64 u64 u128, + i8 u8 i16, + i16 u16 i32, + i32 u32 i64, + i64 u64 i128 +); + +public_test_dep! { +/// Trait to express (possibly lossy) casting of integers +pub(crate) trait CastInto: Copy { + fn cast(self) -> T; +} +} + +macro_rules! cast_into { + ($ty:ty) => { + cast_into!($ty; usize, isize, u8, i8, u16, i16, u32, i32, u64, i64, u128, i128); + }; + ($ty:ty; $($into:ty),*) => {$( + impl CastInto<$into> for $ty { + fn cast(self) -> $into { + self as $into + } + } + )*}; +} + +cast_into!(usize); +cast_into!(isize); +cast_into!(u8); +cast_into!(i8); +cast_into!(u16); +cast_into!(i16); +cast_into!(u32); +cast_into!(i32); +cast_into!(u64); +cast_into!(i64); +cast_into!(u128); +cast_into!(i128); diff --git a/vendor/compiler_builtins/src/int/mul.rs b/vendor/compiler_builtins/src/int/mul.rs new file mode 100644 index 000000000..07ce061c9 --- /dev/null +++ b/vendor/compiler_builtins/src/int/mul.rs @@ -0,0 +1,138 @@ +use int::{DInt, HInt, Int}; + +trait Mul: DInt +where + Self::H: DInt, +{ + fn mul(self, rhs: Self) -> Self { + // In order to prevent infinite recursion, we cannot use the `widen_mul` in this: + //self.lo().widen_mul(rhs.lo()) + // .wrapping_add(self.lo().wrapping_mul(rhs.hi()).widen_hi()) + // .wrapping_add(self.hi().wrapping_mul(rhs.lo()).widen_hi()) + + let lhs_lo = self.lo(); + let rhs_lo = rhs.lo(); + // construct the widening multiplication using only `Self::H` sized multiplications + let tmp_0 = lhs_lo.lo().zero_widen_mul(rhs_lo.lo()); + let tmp_1 = lhs_lo.lo().zero_widen_mul(rhs_lo.hi()); + let tmp_2 = lhs_lo.hi().zero_widen_mul(rhs_lo.lo()); + let tmp_3 = lhs_lo.hi().zero_widen_mul(rhs_lo.hi()); + // sum up all widening partials + let mul = Self::from_lo_hi(tmp_0, tmp_3) + .wrapping_add(tmp_1.zero_widen() << (Self::BITS / 4)) + .wrapping_add(tmp_2.zero_widen() << (Self::BITS / 4)); + // add the higher partials + mul.wrapping_add(lhs_lo.wrapping_mul(rhs.hi()).widen_hi()) + .wrapping_add(self.hi().wrapping_mul(rhs_lo).widen_hi()) + } +} + +impl Mul for u64 {} +impl Mul for i128 {} + +pub(crate) trait UMulo: Int + DInt { + fn mulo(self, rhs: Self) -> (Self, bool) { + match (self.hi().is_zero(), rhs.hi().is_zero()) { + // overflow is guaranteed + (false, false) => (self.wrapping_mul(rhs), true), + (true, false) => { + let mul_lo = self.lo().widen_mul(rhs.lo()); + let mul_hi = 
self.lo().widen_mul(rhs.hi()); + let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi()); + (mul, o || !mul_hi.hi().is_zero()) + } + (false, true) => { + let mul_lo = rhs.lo().widen_mul(self.lo()); + let mul_hi = rhs.lo().widen_mul(self.hi()); + let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi()); + (mul, o || !mul_hi.hi().is_zero()) + } + // overflow is guaranteed to not happen, and use a smaller widening multiplication + (true, true) => (self.lo().widen_mul(rhs.lo()), false), + } + } +} + +impl UMulo for u32 {} +impl UMulo for u64 {} +impl UMulo for u128 {} + +macro_rules! impl_signed_mulo { + ($fn:ident, $iD:ident, $uD:ident) => { + fn $fn(lhs: $iD, rhs: $iD) -> ($iD, bool) { + let mut lhs = lhs; + let mut rhs = rhs; + // the test against `mul_neg` below fails without this early return + if lhs == 0 || rhs == 0 { + return (0, false); + } + + let lhs_neg = lhs < 0; + let rhs_neg = rhs < 0; + if lhs_neg { + lhs = lhs.wrapping_neg(); + } + if rhs_neg { + rhs = rhs.wrapping_neg(); + } + let mul_neg = lhs_neg != rhs_neg; + + let (mul, o) = (lhs as $uD).mulo(rhs as $uD); + let mut mul = mul as $iD; + + if mul_neg { + mul = mul.wrapping_neg(); + } + if (mul < 0) != mul_neg { + // this one check happens to catch all edge cases related to `$iD::MIN` + (mul, true) + } else { + (mul, o) + } + } + }; +} + +impl_signed_mulo!(i32_overflowing_mul, i32, u32); +impl_signed_mulo!(i64_overflowing_mul, i64, u64); +impl_signed_mulo!(i128_overflowing_mul, i128, u128); + +intrinsics! { + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_lmul] + #[cfg(any(not(any(target_arch = "riscv32", target_arch = "riscv64")), target_feature = "m"))] + pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 { + a.mul(b) + } + + pub extern "C" fn __multi3(a: i128, b: i128) -> i128 { + a.mul(b) + } + + pub extern "C" fn __mulosi4(a: i32, b: i32, oflow: &mut i32) -> i32 { + let (mul, o) = i32_overflowing_mul(a, b); + *oflow = o as i32; + mul + } + + pub extern "C" fn __mulodi4(a: i64, b: i64, oflow: &mut i32) -> i64 { + let (mul, o) = i64_overflowing_mul(a, b); + *oflow = o as i32; + mul + } + + #[unadjusted_on_win64] + pub extern "C" fn __muloti4(a: i128, b: i128, oflow: &mut i32) -> i128 { + let (mul, o) = i128_overflowing_mul(a, b); + *oflow = o as i32; + mul + } + + pub extern "C" fn __rust_i128_mulo(a: i128, b: i128) -> (i128, bool) { + i128_overflowing_mul(a, b) + } + + pub extern "C" fn __rust_u128_mulo(a: u128, b: u128) -> (u128, bool) { + a.mulo(b) + } +} diff --git a/vendor/compiler_builtins/src/int/sdiv.rs b/vendor/compiler_builtins/src/int/sdiv.rs new file mode 100644 index 000000000..f1822f0f8 --- /dev/null +++ b/vendor/compiler_builtins/src/int/sdiv.rs @@ -0,0 +1,169 @@ +use int::udiv::*; + +macro_rules! sdivmod { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($attr:tt),* // attributes + ) => { + intrinsics! 
{ + #[avr_skip] + $( + #[$attr] + )* + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX, rem: &mut $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let mut r = *rem as $uX; + let t = $unsigned_fn(a as $uX, b as $uX, Some(&mut r)) as $iX; + let mut r = r as $iX; + if a_neg { + r = r.wrapping_neg(); + } + *rem = r; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } + } + } + } +} + +macro_rules! sdiv { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($attr:tt),* // attributes + ) => { + intrinsics! { + #[avr_skip] + $( + #[$attr] + )* + /// Returns `n / d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let t = $unsigned_fn(a as $uX, b as $uX) as $iX; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } + } + } + } +} + +macro_rules! smod { + ( + $unsigned_fn:ident, // name of the unsigned division function + $signed_fn:ident, // name of the signed division function + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($attr:tt),* // attributes + ) => { + intrinsics! { + #[avr_skip] + $( + #[$attr] + )* + /// Returns `n % d` + pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let r = $unsigned_fn(a as $uX, b as $uX) as $iX; + if a_neg { + r.wrapping_neg() + } else { + r + } + } + } + } +} + +sdivmod!( + __udivmodsi4, + __divmodsi4, + u32, + i32, + maybe_use_optimized_c_shim +); +// The `#[arm_aeabi_alias = __aeabi_idiv]` attribute cannot be made to work with `intrinsics!` in macros +intrinsics! 
{ + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_idiv] + /// Returns `n / d` + pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { + let a_neg = a < 0; + let b_neg = b < 0; + let mut a = a; + let mut b = b; + if a_neg { + a = a.wrapping_neg(); + } + if b_neg { + b = b.wrapping_neg(); + } + let t = __udivsi3(a as u32, b as u32) as i32; + if a_neg != b_neg { + t.wrapping_neg() + } else { + t + } + } +} +smod!(__umodsi3, __modsi3, u32, i32, maybe_use_optimized_c_shim); + +sdivmod!( + __udivmoddi4, + __divmoddi4, + u64, + i64, + maybe_use_optimized_c_shim +); +sdiv!(__udivdi3, __divdi3, u64, i64, maybe_use_optimized_c_shim); +smod!(__umoddi3, __moddi3, u64, i64, maybe_use_optimized_c_shim); + +// LLVM does not currently have a `__divmodti4` function, but GCC does +sdivmod!( + __udivmodti4, + __divmodti4, + u128, + i128, + maybe_use_optimized_c_shim +); +sdiv!(__udivti3, __divti3, u128, i128, win64_128bit_abi_hack); +smod!(__umodti3, __modti3, u128, i128, win64_128bit_abi_hack); diff --git a/vendor/compiler_builtins/src/int/shift.rs b/vendor/compiler_builtins/src/int/shift.rs new file mode 100644 index 000000000..908e619e1 --- /dev/null +++ b/vendor/compiler_builtins/src/int/shift.rs @@ -0,0 +1,116 @@ +use int::{DInt, HInt, Int}; + +trait Ashl: DInt { + /// Returns `a << b`, requires `b < Self::BITS` + fn ashl(self, shl: u32) -> Self { + let n_h = Self::H::BITS; + if shl & n_h != 0 { + // we only need `self.lo()` because `self.hi()` will be shifted out entirely + self.lo().wrapping_shl(shl - n_h).widen_hi() + } else if shl == 0 { + self + } else { + Self::from_lo_hi( + self.lo().wrapping_shl(shl), + self.lo().logical_shr(n_h - shl) | self.hi().wrapping_shl(shl), + ) + } + } +} + +impl Ashl for u32 {} +impl Ashl for u64 {} +impl Ashl for u128 {} + +trait Ashr: DInt { + /// Returns arithmetic `a >> b`, requires `b < Self::BITS` + fn ashr(self, shr: u32) -> Self { + let n_h = Self::H::BITS; + if shr & n_h != 0 { + Self::from_lo_hi( + self.hi().wrapping_shr(shr - n_h), + // smear the sign bit + self.hi().wrapping_shr(n_h - 1), + ) + } else if shr == 0 { + self + } else { + Self::from_lo_hi( + self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr), + self.hi().wrapping_shr(shr), + ) + } + } +} + +impl Ashr for i32 {} +impl Ashr for i64 {} +impl Ashr for i128 {} + +trait Lshr: DInt { + /// Returns logical `a >> b`, requires `b < Self::BITS` + fn lshr(self, shr: u32) -> Self { + let n_h = Self::H::BITS; + if shr & n_h != 0 { + self.hi().logical_shr(shr - n_h).zero_widen() + } else if shr == 0 { + self + } else { + Self::from_lo_hi( + self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h - shr), + self.hi().logical_shr(shr), + ) + } + } +} + +impl Lshr for u32 {} +impl Lshr for u64 {} +impl Lshr for u128 {} + +intrinsics! 
{ + #[maybe_use_optimized_c_shim] + pub extern "C" fn __ashlsi3(a: u32, b: u32) -> u32 { + a.ashl(b) + } + + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_llsl] + pub extern "C" fn __ashldi3(a: u64, b: u32) -> u64 { + a.ashl(b) + } + + pub extern "C" fn __ashlti3(a: u128, b: u32) -> u128 { + a.ashl(b) + } + + #[maybe_use_optimized_c_shim] + pub extern "C" fn __ashrsi3(a: i32, b: u32) -> i32 { + a.ashr(b) + } + + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_lasr] + pub extern "C" fn __ashrdi3(a: i64, b: u32) -> i64 { + a.ashr(b) + } + + pub extern "C" fn __ashrti3(a: i128, b: u32) -> i128 { + a.ashr(b) + } + + #[maybe_use_optimized_c_shim] + pub extern "C" fn __lshrsi3(a: u32, b: u32) -> u32 { + a.lshr(b) + } + + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_llsr] + pub extern "C" fn __lshrdi3(a: u64, b: u32) -> u64 { + a.lshr(b) + } + + pub extern "C" fn __lshrti3(a: u128, b: u32) -> u128 { + a.lshr(b) + } +} diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs new file mode 100644 index 000000000..56ce188a3 --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/asymmetric.rs @@ -0,0 +1,69 @@ +/// Creates an unsigned division function optimized for dividing integers with the same +/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an +/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits +/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to +/// construct a full 128 bit by 128 bit division. +#[allow(unused_macros)] +macro_rules! impl_asymmetric { + ( + $fn:ident, // name of the unsigned division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_division:ident, // function for division of a $uX by a $uX + $asymmetric_division:ident, // function for division of a $uD by a $uX + $n_h:expr, // the number of bits in a $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident // unsigned integer type for the inputs and outputs of `$fn` + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { + let n: u32 = $n_h * 2; + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + if div_hi == 0 { + if div_lo == 0 { + $zero_div_fn() + } + if duo_hi < div_lo { + // `$uD` by `$uX` division with a quotient that will fit into a `$uX` + let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) }; + return (quo as $uD, rem as $uD); + } else { + // Short division using the $uD by $uX division + let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); + let tmp = unsafe { + $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) + }; + return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD); + } + } + + // This has been adapted from + // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn + // adapted from Hacker's Delight. This is similar to the two possibility algorithm + // in that it uses only more significant parts of `duo` and `div` to divide a large + // integer with a smaller division instruction. 
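+            // `div_sig_n` below is `div` truncated to its `n` most significant bits,
+            // which is small enough for the fast asymmetric division instruction. The
+            // quotient estimated from the truncated divisor is close to the true one;
+            // the unconditional decrement and the single conditional correction at the
+            // end account for the remaining error (see the linked derivation for the
+            // exact bound).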
+ let div_lz = div_hi.leading_zeros(); + let div_extra = n - div_lz; + let div_sig_n = (div >> div_extra) as $uX; + let tmp = unsafe { $asymmetric_division(duo >> 1, div_sig_n) }; + + let mut quo = tmp.0 >> ((n - 1) - div_lz); + if quo != 0 { + quo -= 1; + } + + // Note that this is a full `$uD` multiplication being used here + let mut rem = duo - (quo as $uD).wrapping_mul(div); + if div <= rem { + quo += 1; + rem -= div; + } + return (quo as $uD, rem); + } + }; +} diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs new file mode 100644 index 000000000..0d7822882 --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs @@ -0,0 +1,548 @@ +/// Creates an unsigned division function that uses binary long division, designed for +/// computer architectures without division instructions. These functions have good performance for +/// microarchitectures with large branch miss penalties and architectures without the ability to +/// predicate instructions. For architectures with predicated instructions, one of the algorithms +/// described in the documentation of these functions probably has higher performance, and a custom +/// assembly routine should be used instead. +#[allow(unused_macros)] +macro_rules! impl_binary_long { + ( + $fn:ident, // name of the unsigned division function + $zero_div_fn:ident, // function called when division by zero is attempted + $normalization_shift:ident, // function for finding the normalization shift + $n:tt, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer type for the inputs and outputs of `$fn` + $iX:ident // signed integer type with same bitwidth as `$uX` + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) { + let mut duo = duo; + // handle edge cases before calling `$normalization_shift` + if div == 0 { + $zero_div_fn() + } + if duo < div { + return (0, duo); + } + + // There are many variations of binary division algorithm that could be used. This + // documentation gives a tour of different methods so that future readers wanting to + // optimize further do not have to painstakingly derive them. The SWAR variation is + // especially hard to understand without reading the less convoluted methods first. + + // You may notice that a `duo < div_original` check is included in many these + // algorithms. A critical optimization that many algorithms miss is handling of + // quotients that will turn out to have many trailing zeros or many leading zeros. This + // happens in cases of exact or close-to-exact divisions, divisions by power of two, and + // in cases where the quotient is small. The `duo < div_original` check handles these + // cases of early returns and ends up replacing other kinds of mundane checks that + // normally terminate a binary division algorithm. + // + // Something you may see in other algorithms that is not special-cased here is checks + // for division by powers of two. The `duo < div_original` check handles this case and + // more, however it can be checked up front before the bisection using the + // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because + // compilers should handle most cases where divisions by power of two occur, and we do + // not want to add on a few cycles for every division operation just to save a few + // cycles rarely. 
+ + // The following example is the most straightforward translation from the way binary + // long division is typically visualized: + // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to + // the result from `$normalization_shift(duo, div, false)`. + // + // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set + // and `duo` is kept unchanged. + // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5 + // + // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step. + // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4 + // + // Step 2: Continue based on `sub`. The `quo` bits start accumulating. + // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3 + // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2 + // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1 + // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0 + // The `duo < div_original` check terminates the algorithm with the correct quotient of + // 29u8 and remainder of 4u8 + /* + let div_original = div; + let mut shl = $normalization_shift(duo, div, false); + let mut quo = 0; + loop { + let div_shifted = div << shl; + let sub = duo.wrapping_sub(div_shifted); + // it is recommended to use `println!`s like this if functionality is unclear + /* + println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}", + duo, + div_shifted, + sub, + quo, + shl + ); + */ + if 0 <= (sub as $iX) { + duo = sub; + quo += 1 << shl; + if duo < div_original { + // this branch is optional + return (quo, duo) + } + } + if shl == 0 { + return (quo, duo) + } + shl -= 1; + } + */ + + // This restoring binary long division algorithm reduces the number of operations + // overall via: + // - `pow` can be shifted right instead of recalculating from `shl` + // - starting `div` shifted left and shifting it right for each step instead of + // recalculating from `shl` + // - The `duo < div_original` branch is used to terminate the algorithm instead of the + // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and + // `div` from being shifted off the end. This check also only occurs on half of steps + // on average, since it is behind the `(sub as $iX) >= 0` branch. + // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are + // being updated between steps + // + // There are many variations of this algorithm, but this encompases the largest number + // of architectures and does not rely on carry flags, add-with-carry, or SWAR + // complications to be decently fast. + /* + let div_original = div; + let shl = $normalization_shift(duo, div, false); + let mut div: $uX = div << shl; + let mut pow: $uX = 1 << shl; + let mut quo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iX) { + duo = sub; + quo |= pow; + if duo < div_original { + return (quo, duo) + } + } + div >>= 1; + pow >>= 1; + } + */ + + // If the architecture has flags and predicated arithmetic instructions, it is possible + // to do binary long division without branching and in only 3 or 4 instructions. This is + // a variation of a 3 instruction central loop from + // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt. 
+ // + // What allows doing division in only 3 instructions is realizing that instead of + // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in + // place and `duo` can be shifted left. This means `div` does not have to be updated, + // but causes edge case problems and makes `duo < div_original` tests harder. Some + // architectures have an option to shift an argument in an arithmetic operation, which + // means `duo` can be shifted left and subtracted from in one instruction. The other two + // instructions are updating `quo` and undoing the subtraction if it turns out things + // were not normalized. + + /* + // Perform one binary long division step on the already normalized arguments, because + // the main. Note that this does a full normalization since the central loop needs + // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original + // variation only did normalization to the nearest 4 steps, but this makes handling edge + // cases much harder. We do a full normalization and perform a binary long division + // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb + // of `duo`, then the edge case handler shifts `div` right and does another long + // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`. + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + let mut quo: $uX = 1; + duo = duo.wrapping_sub(div); + if duo < div_original { + return (1 << shl, duo); + } + let div_neg: $uX; + if (div as $iX) < 0 { + // A very ugly edge case where the most significant bit of `div` is set (after + // shifting to match `duo` when its most significant bit is at the sign bit), which + // leads to the sign bit of `div_neg` being cut off and carries not happening when + // they should. This branch performs a long division step that keeps `duo` in place + // and shifts `div` down. + div >>= 1; + div_neg = div.wrapping_neg(); + let (sub, carry) = duo.overflowing_add(div_neg); + duo = sub; + quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); + if !carry { + duo = duo.wrapping_add(div); + } + shl -= 1; + } else { + div_neg = div.wrapping_neg(); + } + // The add-with-carry that updates `quo` needs to have the carry set when a normalized + // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the + // subtraction generates a carry when an unnormalized subtract happens, which is the + // opposite of what we want. Instead, we use + // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`. + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + // `ADDS duo, div, duo, LSL #1` + // (add `div` to `duo << 1` and set flags) + let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg); + duo = sub; + // `ADC quo, quo, quo` + // (add with carry). Effectively shifts `quo` left by 1 and sets the least + // significant bit to the carry. + quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); + // `ADDCC duo, duo, div` + // (add if carry clear). Undoes the subtraction if no carry was generated. + if !carry { + duo = duo.wrapping_add(div); + } + } + return (quo, duo >> shl); + */ + + // This is the SWAR (SIMD within in a register) restoring division algorithm. 
+ // This combines several ideas of the above algorithms: + // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction + // restoring division algorithm, some architectures can do the shifting and + // subtraction step in one instruction. + // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one + // and adding one. + // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the + // LSB, so what if we use those bits to store `quo`? + // Through a complex setup, it is possible to manage `duo` and `quo` in the same + // register, and perform one step with 2 or 3 instructions. The only major downsides are + // that there is significant setup (it is only saves instructions if `shl` is + // approximately more than 4), `duo < div_original` checks are impractical once SWAR is + // initiated, and the number of division steps taken has to be exact (we cannot do more + // division steps than `shl`, because it introduces edge cases where quotient bits in + // `duo` start to collide with the real part of `div`. + /* + // first step. The quotient bit is stored in `quo` for now + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + + let mask: $uX; + if (div as $iX) < 0 { + // deal with same edge case as the 3 instruction restoring division algorithm, but + // the quotient bit from this step also has to be stored in `quo` + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + mask = tmp - 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + // restore + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + } else { + mask = quo - 1; + } + // There is now room for quotient bits in `duo`. + + // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from + // `div` and end up with the subset of `shl` bits being all being set. This subset acts + // just like a two's complement negative one. The subset of `div` containing the divisor + // had 1 subtracted from it, but a carry will always be generated from the `shl` subset + // as long as the quotient stays positive. + // + // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset + // adds a quotient bit to the least significant bit. + // For example, 89 (0b01011001) divided by 3 (0b11): + // + // shl:4, div:0b00110000 + // first step: + // duo:0b01011001 + // + div_neg:0b11010000 + // ____________________ + // 0b00101001 + // quo is set to 0b00010000 and mask is set to 0b00001111 for later + // + // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the + // quotient part of `duo` with `^`s. + // chars. + // div:0b00110000 + // ^^^^ + // + 0b11111111 + // ________________ + // 0b00101111 + // ^^^^ + // div_neg:0b11010001 + // + // first SWAR step: + // duo_shl1:0b01010010 + // ^ + // + div_neg:0b11010001 + // ____________________ + // 0b00100011 + // ^ + // second: + // duo_shl1:0b01000110 + // ^^ + // + div_neg:0b11010001 + // ____________________ + // 0b00010111 + // ^^ + // third: + // duo_shl1:0b00101110 + // ^^^ + // + div_neg:0b11010001 + // ____________________ + // 0b11111111 + // ^^^ + // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real + // part of `duo` is negative and the third step was an unnormalized step. 
The restore + // branch then restores `duo`. Note that the restore branch does not shift `duo` left. + // + // duo:0b11111111 + // ^^^ + // + div:0b00101111 + // ^^^^ + // ________________ + // 0b00101110 + // ^^^ + // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an + // unset quotient bit. + // + // final step (`shl` was 4, so exactly 4 steps must be taken) + // duo_shl1:0b01011100 + // ^^^^ + // + div_neg:0b11010001 + // ____________________ + // 0b00101101 + // ^^^^ + // The quotient includes the `^` bits added with the `quo` bits from the beginning that + // contained the first step and potential edge case step, + // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`. + // The remainder is the bits remaining in `duo` that are not part of the quotient bits, + // `duo:0b00101101 >> shl == 0b0010 == 2u8`. + let div: $uX = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + duo = duo.wrapping_shl(1).wrapping_sub(div); + if (duo as $iX) < 0 { + // restore + duo = duo.wrapping_add(div); + } + } + // unpack the results of SWAR + return ((duo & mask) | quo, duo >> shl); + */ + + // The problem with the conditional restoring SWAR algorithm above is that, in practice, + // it requires assembly code to bring out its full unrolled potential (It seems that + // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit + // that my algorithm intends. On architectures without predicated instructions, the code + // gen is especially bad. We need a default software division algorithm that is + // guaranteed to get decent code gen for the central loop. + + // For non-SWAR algorithms, there is a way to do binary long division without + // predication or even branching. This involves creating a mask from the sign bit and + // performing different kinds of steps using that. + /* + let shl = $normalization_shift(duo, div, true); + let mut div: $uX = div << shl; + let mut pow: $uX = 1 << shl; + let mut quo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX); + duo -= div & sign_mask; + quo |= pow & sign_mask; + div >>= 1; + pow >>= 1; + if pow == 0 { + break; + } + } + return (quo, duo); + */ + // However, it requires about 4 extra operations (smearing the sign bit, negating the + // mask, and applying the mask twice) on top of the operations done by the actual + // algorithm. With SWAR however, just 2 extra operations are needed, making it + // practical and even the most optimal algorithm for some architectures. + + // What we do is use custom assembly for predicated architectures that need software + // division, and for the default algorithm use a mask based restoring SWAR algorithm + // without conditionals or branches. On almost all architectures, this Rust code is + // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM + // will unroll it in a decent way. 
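+ //
+ // (The "mask" in these variants is just a smeared sign bit: for `u32`/`i32`,
+ // `(x as i32).wrapping_shr(31) as u32` is `0xFFFF_FFFF` when the sign bit of `x` is
+ // set and `0` otherwise, so `div & mask` adds back (or withholds) `div` without a
+ // branch.)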
+ + // standard opening for SWAR algorithm with first step and edge case handling + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + let mask: $uX; + if (div as $iX) < 0 { + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + mask = tmp - 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + } else { + mask = quo - 1; + } + + // central loop + div = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + // shift left 1 and subtract + duo = duo.wrapping_shl(1).wrapping_sub(div); + // create mask + let mask = (duo as $iX).wrapping_shr($n - 1) as $uX; + // restore + duo = duo.wrapping_add(div & mask); + } + // unpack + return ((duo & mask) | quo, duo >> shl); + + // miscellanious binary long division algorithms that might be better for specific + // architectures + + // Another kind of long division uses an interesting fact that `div` and `pow` can be + // negated when `duo` is negative to perform a "negated" division step that works in + // place of any normalization mechanism. This is a non-restoring division algorithm that + // is very similar to the non-restoring division algorithms that can be found on the + // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can + // be viewed as shifting the least significant set bit right (e.x. if we enter a series + // of negated binary long division steps starting with `quo == 0b1011_0000` and + // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100, + // 0b1010_0010, 0b1010_0001). + /* + let div_original = div; + let shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + let mut pow: $uX = 1 << shl; + let mut quo: $uX = pow; + duo = duo.wrapping_sub(div); + if duo < div_original { + return (quo, duo); + } + div >>= 1; + pow >>= 1; + loop { + if (duo as $iX) < 0 { + // Negated binary long division step. + duo = duo.wrapping_add(div); + quo = quo.wrapping_sub(pow); + } else { + // Normal long division step. + if duo < div_original { + return (quo, duo) + } + duo = duo.wrapping_sub(div); + quo = quo.wrapping_add(pow); + } + pow >>= 1; + div >>= 1; + } + */ + + // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with + // SWAR techniques that makes the only difference between steps be negation of `div`. + // If there was an architecture with an instruction that negated inputs to an adder + // based on conditionals, and in place shifting (or a three input addition operation + // that can have `duo` as two of the inputs to effectively shift it left by 1), then a + // single instruction central loop is possible. Microarchitectures often have inputs to + // their ALU that can invert the arguments and carry in of adders, but the architectures + // unfortunately do not have an instruction to dynamically invert this input based on + // conditionals. 
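+ //
+ // (Whichever variant is selected, an instantiation is easy to sanity check against
+ // the native operators; a hypothetical test for a `u32` instantiation named
+ // `u32_div_rem` is sketched here, and the commented-out nonrestoring SWAR code
+ // itself follows below.)
+ /*
+ #[test]
+ fn binary_long_matches_native() {
+     for &duo in &[1u32, 2, 3, 90, 178, 0x8000_0000, u32::MAX] {
+         for &div in &[1u32, 2, 3, 6, 7, 0xFFFF, 0x8000_0000] {
+             assert_eq!(u32_div_rem(duo, div), (duo / div, duo % div));
+         }
+     }
+ }
+ */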
+ /* + // SWAR opening + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + let mask: $uX; + if (div as $iX) < 0 { + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + // restore + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + mask = tmp - 1; + } else { + mask = quo - 1; + } + + // central loop + let div: $uX = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + // note: the `wrapping_shl(1)` can be factored out, but would require another + // restoring division step to prevent `(duo as $iX)` from overflowing + if (duo as $iX) < 0 { + // Negated binary long division step. + duo = duo.wrapping_shl(1).wrapping_add(div); + } else { + // Normal long division step. + duo = duo.wrapping_shl(1).wrapping_sub(div); + } + } + if (duo as $iX) < 0 { + // Restore. This was not needed in the original nonrestoring algorithm because of + // the `duo < div_original` checks. + duo = duo.wrapping_add(div); + } + // unpack + return ((duo & mask) | quo, duo >> shl); + */ + } + }; +} diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs new file mode 100644 index 000000000..330c6e4f8 --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/delegate.rs @@ -0,0 +1,319 @@ +/// Creates an unsigned division function that uses a combination of hardware division and +/// binary long division to divide integers larger than what hardware division by itself can do. This +/// function is intended for microarchitectures that have division hardware, but not fast enough +/// multiplication hardware for `impl_trifecta` to be faster. +#[allow(unused_macros)] +macro_rules! impl_delegate { + ( + $fn:ident, // name of the unsigned division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_normalization_shift:ident, // function for finding the normalization shift of $uX + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD. + $uD:ident, // unsigned integer type for the inputs and outputs of `$fn` + $iD:ident // signed integer type with the same bitwidth as `$uD` + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { + // The two possibility algorithm, undersubtracting long division algorithm, or any kind + // of reciprocal based algorithm will not be fastest, because they involve large + // multiplications that we assume to not be fast enough relative to the divisions to + // outweigh setup times. 
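+
+ // For concreteness: with the `u64` instantiation of this macro (`$uD = u64`,
+ // `$uX = u32`, `$n_h = 16`), the splits below look like the following sketch.
+ /*
+ let duo: u64 = 0x1234_5678_9ABC_DEF0;
+ let duo_lo = duo as u32; // 0x9ABC_DEF0
+ let duo_hi = (duo >> 32) as u32; // 0x1234_5678
+ assert_eq!(((duo_hi as u64) << 32) | (duo_lo as u64), duo);
+ */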
+ + // the number of bits in a $uX + let n = $n_h * 2; + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + match (div_lo == 0, div_hi == 0, duo_hi == 0) { + (true, true, _) => $zero_div_fn(), + (_, false, true) => { + // `duo` < `div` + return (0, duo); + } + (false, true, true) => { + // delegate to smaller division + let tmp = $half_division(duo_lo, div_lo); + return (tmp.0 as $uD, tmp.1 as $uD); + } + (false, true, false) => { + if duo_hi < div_lo { + // `quo_hi` will always be 0. This performs a binary long division algorithm + // to zero `duo_hi` followed by a half division. + + // We can calculate the normalization shift using only `$uX` size functions. + // If we calculated the normalization shift using + // `$half_normalization_shift(duo_hi, div_lo false)`, it would break the + // assumption the function has that the first argument is more than the + // second argument. If the arguments are switched, the assumption holds true + // since `duo_hi < div_lo`. + let norm_shift = $half_normalization_shift(div_lo, duo_hi, false); + let shl = if norm_shift == 0 { + // Consider what happens if the msbs of `duo_hi` and `div_lo` align with + // no shifting. The normalization shift will always return + // `norm_shift == 0` regardless of whether it is fully normalized, + // because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would + // result in shift overflow down the line. For the edge case, because + // both `duo_hi < div_lo` and we are comparing all the significant bits + // of `duo_hi` and `div`, we can make `shl = n - 1`. + n - 1 + } else { + // We also cannot just use `shl = n - norm_shift - 1` in the general + // case, because when we are not in the edge case comparing all the + // significant bits, then the full `duo < div` may not be true and thus + // breaks the division algorithm. + n - norm_shift + }; + + // The 3 variable restoring division algorithm (see binary_long.rs) is ideal + // for this task, since `pow` and `quo` can be `$uX` and the delegation + // check is simple. + let mut div: $uD = div << shl; + let mut pow_lo: $uX = 1 << shl; + let mut quo_lo: $uX = 0; + let mut duo = duo; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 { + // Delegate to get the rest of the quotient. Note that the + // `div_lo` here is the original unshifted `div`. + let tmp = $half_division(duo as $uX, div_lo); + return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD); + } + } + div >>= 1; + pow_lo >>= 1; + } + } else if duo_hi == div_lo { + // `quo_hi == 1`. This branch is cheap and helps with edge cases. 
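+ // Here `div_hi == 0` and `duo_hi == div_lo`, so `duo == (div << n) + duo_lo` and
+ // the quotient is exactly `(1 << n) + duo_lo / div_lo` with remainder
+ // `duo_lo % div_lo`. The `|` below cannot collide with the high bit, because
+ // `duo_lo / div_lo < 2^n`.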
+ let tmp = $half_division(duo as $uX, div as $uX); + return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD); + } else { + // `div_lo < duo_hi` + // `rem_hi == 0` + if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH, using $uX by $uX division + let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n), + rem_1 as $uD, + ); + } + + // This is basically a short division composed of a half division for the hi + // part, specialized 3 variable binary long division in the middle, and + // another half division for the lo part. + let duo_lo = duo as $uX; + let tmp = $half_division(duo_hi, div_lo); + let quo_hi = tmp.0; + let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n); + // This check is required to avoid breaking the long division below. + if duo < div { + return ((quo_hi as $uD) << n, duo); + } + + // The half division handled all shift alignments down to `n`, so this + // division can continue with a shift of `n - 1`. + let mut div: $uD = div << (n - 1); + let mut pow_lo: $uX = 1 << (n - 1); + let mut quo_lo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 { + // Delegate to get the rest of the quotient. Note that the + // `div_lo` here is the original unshifted `div`. + let tmp = $half_division(duo as $uX, div_lo); + return ( + (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n), + tmp.1 as $uD, + ); + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } + (_, false, false) => { + // Full $uD by $uD binary long division. `quo_hi` will always be 0. + if duo < div { + return (0, duo); + } + let div_original = div; + let shl = $half_normalization_shift(duo_hi, div_hi, false); + let mut duo = duo; + let mut div: $uD = div << shl; + let mut pow_lo: $uX = 1 << shl; + let mut quo_lo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + if duo < div_original { + return (quo_lo as $uD, duo); + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } + } + }; +} + +public_test_dep! { +/// Returns `n / d` and sets `*rem = n % d`. +/// +/// This specialization exists because: +/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`, +/// so we have to use an old fashioned `&mut u128` argument to return the remainder. +/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the +/// delegate algorithm strategy the only reasonably fast way to perform `u128` division. 
+// used on SPARC +#[allow(dead_code)] +pub(crate) fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 { + use super::*; + let duo_lo = duo as u64; + let duo_hi = (duo >> 64) as u64; + let div_lo = div as u64; + let div_hi = (div >> 64) as u64; + + match (div_lo == 0, div_hi == 0, duo_hi == 0) { + (true, true, _) => zero_div_fn(), + (_, false, true) => { + *rem = duo; + return 0; + } + (false, true, true) => { + let tmp = u64_by_u64_div_rem(duo_lo, div_lo); + *rem = tmp.1 as u128; + return tmp.0 as u128; + } + (false, true, false) => { + if duo_hi < div_lo { + let norm_shift = u64_normalization_shift(div_lo, duo_hi, false); + let shl = if norm_shift == 0 { + 64 - 1 + } else { + 64 - norm_shift + }; + + let mut div: u128 = div << shl; + let mut pow_lo: u64 = 1 << shl; + let mut quo_lo: u64 = 0; + let mut duo = duo; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as i128) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> 64) as u64; + if duo_hi == 0 { + let tmp = u64_by_u64_div_rem(duo as u64, div_lo); + *rem = tmp.1 as u128; + return (quo_lo | tmp.0) as u128; + } + } + div >>= 1; + pow_lo >>= 1; + } + } else if duo_hi == div_lo { + let tmp = u64_by_u64_div_rem(duo as u64, div as u64); + *rem = tmp.1 as u128; + return (1 << 64) | (tmp.0 as u128); + } else { + if (div_lo >> 32) == 0 { + let div_0 = div_lo as u32 as u64; + let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0); + + let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32); + let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0); + + let duo_lo = (duo as u32 as u64) | (rem_2 << 32); + let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0); + + *rem = rem_1 as u128; + return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64); + } + + let duo_lo = duo as u64; + let tmp = u64_by_u64_div_rem(duo_hi, div_lo); + let quo_hi = tmp.0; + let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64); + if duo < div { + *rem = duo; + return (quo_hi as u128) << 64; + } + + let mut div: u128 = div << (64 - 1); + let mut pow_lo: u64 = 1 << (64 - 1); + let mut quo_lo: u64 = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as i128) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> 64) as u64; + if duo_hi == 0 { + let tmp = u64_by_u64_div_rem(duo as u64, div_lo); + *rem = tmp.1 as u128; + return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64); + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } + (_, false, false) => { + if duo < div { + *rem = duo; + return 0; + } + let div_original = div; + let shl = u64_normalization_shift(duo_hi, div_hi, false); + let mut duo = duo; + let mut div: u128 = div << shl; + let mut pow_lo: u64 = 1 << shl; + let mut quo_lo: u64 = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as i128) { + duo = sub; + quo_lo |= pow_lo; + if duo < div_original { + *rem = duo; + return quo_lo as u128; + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } +} +} diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs new file mode 100644 index 000000000..6ec4675df --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs @@ -0,0 +1,306 @@ +// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this +#![allow(unused_unsafe)] +// The functions are complex with many branches, and explicit +// `return`s makes it clear where function exit points are +#![allow(clippy::needless_return)] +#![allow(clippy::comparison_chain)] +// Clippy is 
confused by the complex configuration +#![allow(clippy::if_same_then_else)] +#![allow(clippy::needless_bool)] + +//! This `specialized_div_rem` module is originally from version 1.0.0 of the +//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this +//! module, since unoptimized compilation may generate references to `memcpy`. +//! +//! The purpose of these macros is to easily change the both the division algorithm used +//! for a given integer size and the half division used by that algorithm. The way +//! functions call each other is also constructed such that linkers will find the chain of +//! software and hardware divisions needed for every size of signed and unsigned division. +//! For example, most target compilations do the following: +//! +//! - Many 128 bit division functions like `u128::wrapping_div` use +//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there +//! is not a 128 bit by 128 bit hardware division function in most architectures. +//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because +//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just +//! one function to calculate both the quotient and remainder. If configuration flags +//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm, +//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture +//! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be +//! reduced to those instructions. Note that we do not specify the half size division +//! directly to be `__udivdi3`, because hardware division would never be introduced. +//! - If the architecture does not supply a 64 bit hardware division instruction, u64 +//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem` +//! which is defined by `impl_delegate!`. The half division for this algorithm is +//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more +//! software division algorithms. +//! - If the architecture does not supply a 32 bit hardware instruction, linkers will +//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half +//! division, so the chain of calls ends here. +//! +//! On some architectures like x86_64, an asymmetrically sized division is supplied, in +//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to +//! extend the 128 by 64 bit division to a full 128 by 128 bit division. + +// `allow(dead_code)` is used in various places, because the configuration code would otherwise be +// ridiculously complex + +#[macro_use] +mod norm_shift; + +#[macro_use] +mod binary_long; + +#[macro_use] +mod delegate; + +// used on SPARC +#[allow(unused_imports)] +#[cfg(not(feature = "public-test-deps"))] +pub(crate) use self::delegate::u128_divide_sparc; + +#[cfg(feature = "public-test-deps")] +pub use self::delegate::u128_divide_sparc; + +#[macro_use] +mod trifecta; + +#[macro_use] +mod asymmetric; + +/// The behavior of all divisions by zero is controlled by this function. This function should be +/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or +/// `core/std::unchecked_div/rem` are directly used without a zero check in front. +fn zero_div_fn() -> ! 
{ + unsafe { core::hint::unreachable_unchecked() } +} + +const USE_LZ: bool = { + if cfg!(target_arch = "arm") { + if cfg!(target_feature = "thumb-mode") { + // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is + // supported. This is needed to successfully differentiate between targets like + // `thumbv8.base` and `thumbv8.main`. + cfg!(target_feature = "v6t2") + } else { + // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is + // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target + // feature does not seem to work. + cfg!(target_feature = "v5te") + } + } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) { + // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later. + cfg!(target_feature = "vis3") + } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { + // The `B` extension on RISC-V determines if a CLZ assembly instruction exists + cfg!(target_feature = "b") + } else { + // All other common targets Rust supports should have CLZ instructions + true + } +}; + +impl_normalization_shift!( + u32_normalization_shift, + USE_LZ, + 32, + u32, + i32, + allow(dead_code) +); +impl_normalization_shift!( + u64_normalization_shift, + USE_LZ, + 64, + u64, + i64, + allow(dead_code) +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// `checked_div` and `checked_rem` are used to avoid bringing in panic function +/// dependencies. +#[inline] +fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { + if let Some(quo) = duo.checked_div(div) { + if let Some(rem) = duo.checked_rem(div) { + return (quo, rem); + } + } + zero_div_fn() +} + +// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a +// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is +// faster if the target pointer width is at least 64. +#[cfg(all( + not(any(target_pointer_width = "16", target_pointer_width = "32")), + not(all(not(feature = "no-asm"), target_arch = "x86_64")), + not(any(target_arch = "sparc", target_arch = "sparc64")) +))] +impl_trifecta!( + u128_div_rem, + zero_div_fn, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128 +); + +// If the pointer width less than 64, then the target architecture almost certainly does not have +// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster. +#[cfg(all( + any(target_pointer_width = "16", target_pointer_width = "32"), + not(all(not(feature = "no-asm"), target_arch = "x86_64")), + not(any(target_arch = "sparc", target_arch = "sparc64")) +))] +impl_delegate!( + u128_div_rem, + zero_div_fn, + u64_normalization_shift, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128 +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// +/// # Safety +/// +/// If the quotient does not fit in a `u64`, a floating point exception occurs. +/// If `div == 0`, then a division by zero exception occurs. +#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))] +#[inline] +unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { + let duo_lo = duo as u64; + let duo_hi = (duo >> 64) as u64; + let quo: u64; + let rem: u64; + unsafe { + // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this) + // by `div`. The quotient is stored in rax and the remainder in rdx. + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
+ core::arch::asm!( + "div {0}", + in(reg) div, + inlateout("rax") duo_lo => quo, + inlateout("rdx") duo_hi => rem, + options(att_syntax, pure, nomem, nostack) + ); + } + (quo, rem) +} + +// use `asymmetric` instead of `trifecta` on x86_64 +#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))] +impl_asymmetric!( + u128_div_rem, + zero_div_fn, + u64_by_u64_div_rem, + u128_by_u64_div_rem, + 32, + u32, + u64, + u128 +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// `checked_div` and `checked_rem` are used to avoid bringing in panic function +/// dependencies. +#[inline] +#[allow(dead_code)] +fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) { + if let Some(quo) = duo.checked_div(div) { + if let Some(rem) = duo.checked_rem(div) { + return (quo, rem); + } + } + zero_div_fn() +} + +// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger +// than register size. +#[cfg(all( + not(all(not(feature = "no-asm"), target_arch = "x86")), + not(target_pointer_width = "64") +))] +impl_delegate!( + u64_div_rem, + zero_div_fn, + u32_normalization_shift, + u32_by_u32_div_rem, + 16, + u16, + u32, + u64, + i64 +); + +// When not on x86 and the pointer width is 64, use `binary_long`. +#[cfg(all( + not(all(not(feature = "no-asm"), target_arch = "x86")), + target_pointer_width = "64" +))] +impl_binary_long!( + u64_div_rem, + zero_div_fn, + u64_normalization_shift, + 64, + u64, + i64 +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// +/// # Safety +/// +/// If the quotient does not fit in a `u32`, a floating point exception occurs. +/// If `div == 0`, then a division by zero exception occurs. +#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))] +#[inline] +unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) { + let duo_lo = duo as u32; + let duo_hi = (duo >> 32) as u32; + let quo: u32; + let rem: u32; + unsafe { + // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this) + // by `div`. The quotient is stored in rax and the remainder in rdx. + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "div {0}", + in(reg) div, + inlateout("rax") duo_lo => quo, + inlateout("rdx") duo_hi => rem, + options(att_syntax, pure, nomem, nostack) + ); + } + (quo, rem) +} + +// use `asymmetric` instead of `delegate` on x86 +#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))] +impl_asymmetric!( + u64_div_rem, + zero_div_fn, + u32_by_u32_div_rem, + u64_by_u32_div_rem, + 16, + u16, + u32, + u64 +); + +// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division +impl_binary_long!( + u32_div_rem, + zero_div_fn, + u32_normalization_shift, + 32, + u32, + i32 +); diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs new file mode 100644 index 000000000..61b67b6bc --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/norm_shift.rs @@ -0,0 +1,106 @@ +/// Creates a function used by some division algorithms to compute the "normalization shift". +#[allow(unused_macros)] +macro_rules! 
impl_normalization_shift { + ( + $name:ident, // name of the normalization shift function + // boolean for if `$uX::leading_zeros` should be used (if an architecture does not have a + // hardware instruction for `usize::leading_zeros`, then this should be `true`) + $use_lz:ident, + $n:tt, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer type for the inputs of `$name` + $iX:ident, // signed integer type for the inputs of `$name` + $($unsigned_attr:meta),* // attributes for the function + ) => { + /// Finds the shift left that the divisor `div` would need to be normalized for a binary + /// long division step with the dividend `duo`. NOTE: This function assumes that these edge + /// cases have been handled before reaching it: + /// ` + /// if div == 0 { + /// panic!("attempt to divide by zero") + /// } + /// if duo < div { + /// return (0, duo) + /// } + /// ` + /// + /// Normalization is defined as (where `shl` is the output of this function): + /// ` + /// if duo.leading_zeros() != (div << shl).leading_zeros() { + /// // If the most significant bits of `duo` and `div << shl` are not in the same place, + /// // then `div << shl` has one more leading zero than `duo`. + /// assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros()); + /// // Also, `2*(div << shl)` is not more than `duo` (otherwise the first division step + /// // would not be able to clear the msb of `duo`) + /// assert!(duo < (div << (shl + 1))); + /// } + /// if full_normalization { + /// // Some algorithms do not need "full" normalization, which means that `duo` is + /// // larger than `div << shl` when the most significant bits are aligned. + /// assert!((div << shl) <= duo); + /// } + /// ` + /// + /// Note: If the software bisection algorithm is being used in this function, it happens + /// that full normalization always occurs, so be careful that new algorithms are not + /// invisibly depending on this invariant when `full_normalization` is set to `false`. + $( + #[$unsigned_attr] + )* + fn $name(duo: $uX, div: $uX, full_normalization: bool) -> usize { + // We have to find the leading zeros of `div` to know where its msb (most significant + // set bit) is to even begin binary long division. It is also good to know where the msb + // of `duo` is so that useful work can be started instead of shifting `div` for all + // possible quotients (many division steps are wasted if `duo.leading_zeros()` is large + // and `div` starts out being shifted all the way to the msb). Aligning the msbs of + // `div` and `duo` could be done by shifting `div` left by + // `div.leading_zeros() - duo.leading_zeros()`, but some CPUs without division hardware + // also do not have single instructions for calculating `leading_zeros`. Instead of + // software doing two bisections to find the two `leading_zeros`, we do one bisection to + // find `div.leading_zeros() - duo.leading_zeros()` without actually knowing either of + // the leading zeros values. + + let mut shl: usize; + if $use_lz { + shl = (div.leading_zeros() - duo.leading_zeros()) as usize; + if full_normalization { + if duo < (div << shl) { + // when the msb of `duo` and `div` are aligned, the resulting `div` may be + // larger than `duo`, so we decrease the shift by 1. 
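+ // For example (using `u8` widths for brevity): `duo = 0b1001_0000` and
+ // `div = 0b0000_1111` give `shl == 4`, but `div << 4 == 0b1111_0000 > duo`,
+ // so `shl` must drop to 3, where `div << 3 == 0b0111_1000 <= duo`.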
+ shl -= 1; + } + } + } else { + let mut test = duo; + shl = 0usize; + let mut lvl = $n >> 1; + loop { + let tmp = test >> lvl; + // It happens that a final `duo < (div << shl)` check is not needed, because the + // `div <= tmp` check insures that the msb of `test` never passes the msb of + // `div`, and any set bits shifted off the end of `test` would still keep + // `div <= tmp` true. + if div <= tmp { + test = tmp; + shl += lvl; + } + // narrow down bisection + lvl >>= 1; + if lvl == 0 { + break + } + } + } + // tests the invariants that should hold before beginning binary long division + /* + if full_normalization { + assert!((div << shl) <= duo); + } + if duo.leading_zeros() != (div << shl).leading_zeros() { + assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros()); + assert!(duo < (div << (shl + 1))); + } + */ + shl + } + } +} diff --git a/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs b/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs new file mode 100644 index 000000000..7e104053b --- /dev/null +++ b/vendor/compiler_builtins/src/int/specialized_div_rem/trifecta.rs @@ -0,0 +1,386 @@ +/// Creates an unsigned division function optimized for division of integers with bitwidths +/// larger than the largest hardware integer division supported. These functions use large radix +/// division algorithms that require both fast division and very fast widening multiplication on the +/// target microarchitecture. Otherwise, `impl_delegate` should be used instead. +#[allow(unused_macros)] +macro_rules! impl_trifecta { + ( + $fn:ident, // name of the unsigned division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name` + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) { + // This is called the trifecta algorithm because it uses three main algorithms: short + // division for small divisors, the two possibility algorithm for large divisors, and an + // undersubtracting long division algorithm for intermediate cases. + + // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this + // to use a widening multiply to 128 bits on the relevant architectures. + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD) + .wrapping_mul(mul as $uD) + .wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + // the number of bits in a $uX + let n = $n_h * 2; + + if div == 0 { + $zero_div_fn() + } + + // Trying to use a normalization shift function will cause inelegancies in the code and + // inefficiencies for architectures with a native count leading zeros instruction. The + // undersubtracting algorithm needs both values (keeping the original `div_lz` but + // updating `duo_lz` multiple times), so we assume hardware support for fast + // `leading_zeros` calculation. 
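+ //
+ // (For reference, with `$uX = u64` and `$uD = u128` as in the `u128_div_rem`
+ // instantiation of this macro, the helpers above are plain widening multiplies
+ // split into halves, e.g.)
+ /*
+ let (lo, hi) = carrying_mul(u64::MAX, u64::MAX);
+ // lo == 1, hi == 0xFFFF_FFFF_FFFF_FFFE, i.e.
+ // ((hi as u128) << 64) | (lo as u128) == (u64::MAX as u128) * (u64::MAX as u128)
+ */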
+ let div_lz = div.leading_zeros(); + let mut duo_lz = duo.leading_zeros(); + + // the possible ranges of `duo` and `div` at this point: + // `0 <= duo < 2^n_d` + // `1 <= div < 2^n_d` + + // quotient is 0 or 1 branch + if div_lz <= duo_lz { + // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at + // least one place higher than `div` for the quotient to be more than 1. + if duo >= div { + return (1, duo - div); + } else { + return (0, duo); + } + } + + // `_sb` is the number of significant bits (from the ones place to the highest set bit) + // `{2, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // smaller division branch + if duo_lz >= n { + // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the + // `div_lz <= duo_lz` branch) so no numerical error. + let (quo, rem) = $half_division(duo as $uX, div as $uX); + return (quo as $uD, rem as $uD); + } + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // short division branch + if div_lz >= (n + $n_h) { + // `1 <= div < {2^duo_sb, 2^n_h}` + + // It is barely possible to improve the performance of this by calculating the + // reciprocal and removing one `$half_division`, but only if the CPU can do fast + // multiplications in parallel. Other reciprocal based methods can remove two + // `$half_division`s, but have multiplications that cannot be done in parallel and + // reduce performance. I have decided to use this trivial short division method and + // rely on the CPU having quick divisions. + + let duo_hi = (duo >> n) as $uX; + let div_0 = div as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n), + rem_1 as $uD, + ); + } + + // relative leading significant bits, cannot overflow because of above branches + let lz_diff = div_lz - duo_lz; + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}` + // `mul` or `mul - 1` branch + if lz_diff < $n_h { + // Two possibility division algorithm + + // The most significant bits of `duo` and `div` are within `$n_h` bits of each + // other. If we take the `n` most significant bits of `duo` and divide them by the + // corresponding bits in `div`, it produces a quotient value `quo`. It happens that + // `quo` or `quo - 1` will always be the correct quotient for the whole number. In + // other words, the bits less significant than the `n` most significant bits of + // `duo` and `div` can only influence the quotient to be one of two values. + // Because there are only two possibilities, there only needs to be one `$uH` sized + // division, a `$uH` by `$uD` multiplication, and only one branch with a few simple + // operations. + // + // Proof that the true quotient can only be `quo` or `quo - 1`. + // All `/` operators here are floored divisions. + // + // `shift` is the number of bits not in the higher `n` significant bits of `duo`. + // (definitions) + // 0. shift = n - duo_lz + // 1. duo_sig_n == duo / 2^shift + // 2. div_sig_n == div / 2^shift + // 3. quo == duo_sig_n / div_sig_n + // + // + // We are trying to find the true quotient, `true_quo`. + // 4. true_quo = duo / div. 
(definition) + // + // This is true because of the bits that are cut off during the bit shift. + // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift. + // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift. + // + // Dividing each bound of (5) by each bound of (6) gives 4 possibilities for what + // `true_quo == duo / div` is bounded by: + // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift) + // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift) + // + // Simplifying each of these four: + // duo_sig_n / div_sig_n + // duo_sig_n / (div_sig_n + 1) + // (duo_sig_n + 1) / div_sig_n + // (duo_sig_n + 1) / (div_sig_n + 1) + // + // Taking the smallest and the largest of these as the low and high bounds + // and replacing `duo / div` with `true_quo`: + // 7. duo_sig_n / (div_sig_n + 1) <= true_quo < (duo_sig_n + 1) / div_sig_n + // + // The `lz_diff < n_h` conditional on this branch makes sure that `div_sig_n` is at + // least `2^n_h`, and the `div_lz <= duo_lz` branch makes sure that the highest bit + // of `div_sig_n` is not the `2^(n - 1)` bit. + // 8. `2^(n - 1) <= duo_sig_n < 2^n` + // 9. `2^n_h <= div_sig_n < 2^(n - 1)` + // + // We want to prove that either + // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1)` or that + // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1) + 1`. + // + // We also want to prove that `quo` is one of these: + // `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or + // `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n`. + // + // When 1 is added to the numerator of `duo_sig_n / div_sig_n` to produce + // `(duo_sig_n + 1) / div_sig_n`, it is not possible that the value increases by + // more than 1 with floored integer arithmetic and `div_sig_n != 0`. Consider + // `x/y + 1 < (x + 1)/y` <=> `x/y + 1 < x/y + 1/y` <=> `1 < 1/y` <=> `y < 1`. + // `div_sig_n` is a nonzero integer. Thus, + // 10. `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n` or + // `(duo_sig_n / div_sig_n) + 1 == (duo_sig_n + 1) / div_sig_n. + // + // When 1 is added to the denominator of `duo_sig_n / div_sig_n` to produce + // `duo_sig_n / (div_sig_n + 1)`, it is not possible that the value decreases by + // more than 1 with the bounds (8) and (9). Consider `x/y - 1 <= x/(y + 1)` <=> + // `(x - y)/y < x/(y + 1)` <=> `(y + 1)*(x - y) < x*y` <=> `x*y - y*y + x - y < x*y` + // <=> `x < y*y + y`. The smallest value of `div_sig_n` is `2^n_h` and the largest + // value of `duo_sig_n` is `2^n - 1`. Substituting reveals `2^n - 1 < 2^n + 2^n_h`. + // Thus, + // 11. `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or + // `(duo_sig_n / div_sig_n) - 1` == duo_sig_n / (div_sig_n + 1)` + // + // Combining both (10) and (11), we know that + // `quo - 1 <= duo_sig_n / (div_sig_n + 1) <= true_quo + // < (duo_sig_n + 1) / div_sig_n <= quo + 1` and therefore: + // 12. quo - 1 <= true_quo < quo + 1 + // + // In a lot of division algorithms using smaller divisions to construct a larger + // division, we often encounter a situation where the approximate `quo` value + // calculated from a smaller division is multiple increments away from the true + // `quo` value. In those algorithms, multiple correction steps have to be applied. + // Those correction steps may need more multiplications to test `duo - (quo*div)` + // again. Because of the fact that our `quo` can only be one of two values, we can + // see if `duo - (quo*div)` overflows. 
If it did overflow, then we know that we have + // the larger of the two values (since the true quotient is unique, and any larger + // quotient will cause `duo - (quo*div)` to be negative). Also because there is only + // one correction needed, we can calculate the remainder `duo - (true_quo*div) == + // duo - ((quo - 1)*div) == duo - (quo*div - div) == duo + div - quo*div`. + // If `duo - (quo*div)` did not overflow, then we have the correct answer. + let shift = n - duo_lz; + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let quo = $half_division(duo_sig_n, div_sig_n).0; + + // The larger `quo` value can overflow `$uD` in the right circumstances. This is a + // manual `carrying_mul_add` with overflow checking. + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(quo, div_lo); + let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + if (overflow != 0) || (duo < tmp) { + return ( + (quo - 1) as $uD, + // Both the addition and subtraction can overflow, but when combined end up + // as a correct positive number. + duo.wrapping_add(div).wrapping_sub(tmp), + ); + } else { + return (quo as $uD, duo - tmp); + } + } + + // Undersubtracting long division algorithm. + // Instead of clearing a minimum of 1 bit from `duo` per iteration via binary long + // division, `n_h - 1` bits are cleared per iteration with this algorithm. It is a more + // complicated version of regular long division. Most integer division algorithms tend + // to guess a part of the quotient, and may have a larger quotient than the true + // quotient (which when multiplied by `div` will "oversubtract" the original dividend). + // They then check if the quotient was in fact too large and then have to correct it. + // This long division algorithm has been carefully constructed to always underguess the + // quotient by slim margins. This allows different subalgorithms to be blindly jumped to + // without needing an extra correction step. + // + // The only problem is that this subalgorithm will not work for many ranges of `duo` and + // `div`. Fortunately, the short division, two possibility algorithm, and other simple + // cases happen to exactly fill these gaps. + // + // For an example, consider the division of 76543210 by 213 and assume that `n_h` is + // equal to two decimal digits (note: we are working with base 10 here for readability). + // The first `sig_n_h` part of the divisor (21) is taken and is incremented by 1 to + // prevent oversubtraction. We also record the number of extra places not a part of + // the `sig_n` or `sig_n_h` parts. + // + // sig_n_h == 2 digits, sig_n == 4 digits + // + // vvvv <- `duo_sig_n` + // 76543210 + // ^^^^ <- extra places in duo, `duo_extra == 4` + // + // vv <- `div_sig_n_h` + // 213 + // ^ <- extra places in div, `div_extra == 1` + // + // The difference in extra places, `duo_extra - div_extra == extra_shl == 3`, is used + // for shifting partial sums in the long division. + // + // In the first step, the first `sig_n` part of duo (7654) is divided by + // `div_sig_n_h_add_1` (22), which results in a partial quotient of 347. This is + // multiplied by the whole divisor to make 73911, which is shifted left by `extra_shl` + // and subtracted from duo. The partial quotient is also shifted left by `extra_shl` to + // be added to `quo`. 
+ // + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // + // Variables dependent on duo have to be updated: + // + // vvvv <- `duo_sig_n == 2632` + // 2632210 + // ^^^ <- `duo_extra == 3` + // + // `extra_shl == 2` + // + // Two more steps are taken after this and then duo fits into `n` bits, and then a final + // normal long division step is made. The partial quotients are all progressively added + // to each other in the actual algorithm, but here I have left them all in a tower that + // can be added together to produce the quotient, 359357. + // + // 14 + // 443 + // 119 + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // -25347 + // 97510 + // -94359 + // 3151 + // -2982 + // 169 <- the remainder + + let mut duo = duo; + let mut quo: $uD = 0; + + // The number of lesser significant bits not a part of `div_sig_n_h` + let div_extra = (n + $n_h) - div_lz; + + // The most significant `n_h` bits of div + let div_sig_n_h = (div >> div_extra) as $uH; + + // This needs to be a `$uX` in case of overflow from the increment + let div_sig_n_h_add1 = (div_sig_n_h as $uX) + 1; + + // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d` + // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}` + loop { + // The number of lesser significant bits not a part of `duo_sig_n` + let duo_extra = n - duo_lz; + + // The most significant `n` bits of `duo` + let duo_sig_n = (duo >> duo_extra) as $uX; + + // the two possibility algorithm requires that the difference between msbs is less + // than `n_h`, so the comparison is `<=` here. + if div_extra <= duo_extra { + // Undersubtracting long division step + let quo_part = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD; + let extra_shl = duo_extra - div_extra; + + // Addition to the quotient. + quo += (quo_part << extra_shl); + + // Subtraction from `duo`. At least `n_h - 1` bits are cleared from `duo` here. + duo -= (div.wrapping_mul(quo_part) << extra_shl); + } else { + // Two possibility algorithm + let shift = n - duo_lz; + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let quo_part = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + let (tmp_lo, carry) = carrying_mul(quo_part, div_lo); + // The undersubtracting long division algorithm has already run once, so + // overflow beyond `$uD` bits is not possible here + let (tmp_hi, _) = carrying_mul_add(quo_part, div_hi, carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + + if duo < tmp { + return ( + quo + ((quo_part - 1) as $uD), + duo.wrapping_add(div).wrapping_sub(tmp), + ); + } else { + return (quo + (quo_part as $uD), duo - tmp); + } + } + + duo_lz = duo.leading_zeros(); + + if div_lz <= duo_lz { + // quotient can have 0 or 1 added to it + if div <= duo { + return (quo + 1, duo - div); + } else { + return (quo, duo); + } + } + + // This can only happen if `div_sd < n` (because of previous "quo = 0 or 1" + // branches), but it is not worth it to unroll further. 
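+ // (At this point `div_lz > duo_lz`, so once `duo` fits in `n` bits, `div` does
+ // too, and a single half division finishes the job.)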
+ if n <= duo_lz { + // simple division and addition + let tmp = $half_division(duo as $uX, div as $uX); + return (quo + (tmp.0 as $uD), tmp.1 as $uD); + } + } + } + }; +} diff --git a/vendor/compiler_builtins/src/int/udiv.rs b/vendor/compiler_builtins/src/int/udiv.rs new file mode 100644 index 000000000..fb09f87d8 --- /dev/null +++ b/vendor/compiler_builtins/src/int/udiv.rs @@ -0,0 +1,106 @@ +#[cfg(not(feature = "public-test-deps"))] +pub(crate) use int::specialized_div_rem::*; + +#[cfg(feature = "public-test-deps")] +pub use int::specialized_div_rem::*; + +intrinsics! { + #[maybe_use_optimized_c_shim] + #[arm_aeabi_alias = __aeabi_uidiv] + /// Returns `n / d` + pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { + u32_div_rem(n, d).0 + } + + #[maybe_use_optimized_c_shim] + /// Returns `n % d` + pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { + u32_div_rem(n, d).1 + } + + #[avr_skip] + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { + let quo_rem = u32_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 + } + + #[avr_skip] + #[maybe_use_optimized_c_shim] + /// Returns `n / d` + pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { + u64_div_rem(n, d).0 + } + + #[avr_skip] + #[maybe_use_optimized_c_shim] + /// Returns `n % d` + pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { + u64_div_rem(n, d).1 + } + + #[avr_skip] + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + let quo_rem = u64_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 + } + + // Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable + // the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs. 
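+ // (With `if cfg!(...)`, both arms would still have to compile on every target, so
+ // `u128_div_rem` and its `(u128, u128)` return type would still be instantiated on
+ // 32-bit SPARC; `#[cfg]` blocks remove the reference entirely.)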
+ + #[avr_skip] + #[win64_128bit_abi_hack] + /// Returns `n / d` + pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { + #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] { + u128_div_rem(n, d).0 + } + #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] { + u128_divide_sparc(n, d, &mut 0) + } + } + + #[avr_skip] + #[win64_128bit_abi_hack] + /// Returns `n % d` + pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { + #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] { + u128_div_rem(n, d).1 + } + #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] { + let mut rem = 0; + u128_divide_sparc(n, d, &mut rem); + rem + } + } + + #[avr_skip] + #[win64_128bit_abi_hack] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { + #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] { + let quo_rem = u128_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 + } + #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] { + let mut tmp = 0; + let quo = u128_divide_sparc(n, d, &mut tmp); + if let Some(rem) = rem { + *rem = tmp; + } + quo + } + } +} diff --git a/vendor/compiler_builtins/src/lib.rs b/vendor/compiler_builtins/src/lib.rs new file mode 100644 index 000000000..009923d27 --- /dev/null +++ b/vendor/compiler_builtins/src/lib.rs @@ -0,0 +1,72 @@ +#![cfg_attr(feature = "compiler-builtins", compiler_builtins)] +#![cfg_attr(not(feature = "no-asm"), feature(asm))] +#![feature(abi_unadjusted)] +#![cfg_attr(not(feature = "no-asm"), feature(global_asm))] +#![feature(cfg_target_has_atomic)] +#![feature(compiler_builtins)] +#![feature(core_ffi_c)] +#![feature(core_intrinsics)] +#![feature(lang_items)] +#![feature(linkage)] +#![feature(naked_functions)] +#![feature(repr_simd)] +#![no_builtins] +#![no_std] +#![allow(unused_features)] +// We use `u128` in a whole bunch of places which we currently agree with the +// compiler on ABIs and such, so we should be "good enough" for now and changes +// to the `u128` ABI will be reflected here. +#![allow(improper_ctypes, improper_ctypes_definitions)] +// `mem::swap` cannot be used because it may generate references to memcpy in unoptimized code. +#![allow(clippy::manual_swap)] +// Support compiling on both stage0 and stage1 which may differ in supported stable features. +#![allow(stable_features)] + +// We disable #[no_mangle] for tests so that we can verify the test results +// against the native compiler-rt implementations of the builtins. + +// NOTE cfg(all(feature = "c", ..)) indicate that compiler-rt provides an arch optimized +// implementation of that intrinsic and we'll prefer to use that + +// NOTE(aapcs, aeabi, arm) ARM targets use intrinsics named __aeabi_* instead of the intrinsics +// that follow "x86 naming convention" (e.g. addsf3). Those aeabi intrinsics must adhere to the +// AAPCS calling convention (`extern "aapcs"`) because that's how LLVM will call them. 
+ +#[cfg(test)] +extern crate core; + +#[macro_use] +mod macros; + +pub mod float; +pub mod int; + +#[cfg(any( + all(target_family = "wasm", target_os = "unknown"), + all(target_arch = "x86_64", target_os = "uefi"), + all(target_arch = "arm", target_os = "none"), + all(target_vendor = "fortanix", target_env = "sgx") +))] +pub mod math; +pub mod mem; + +#[cfg(target_arch = "arm")] +pub mod arm; + +#[cfg(all( + kernel_user_helpers, + any(target_os = "linux", target_os = "android"), + target_arch = "arm" +))] +pub mod arm_linux; + +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +pub mod riscv; + +#[cfg(target_arch = "x86")] +pub mod x86; + +#[cfg(target_arch = "x86_64")] +pub mod x86_64; + +pub mod probestack; diff --git a/vendor/compiler_builtins/src/macros.rs b/vendor/compiler_builtins/src/macros.rs new file mode 100644 index 000000000..518a18d4d --- /dev/null +++ b/vendor/compiler_builtins/src/macros.rs @@ -0,0 +1,448 @@ +//! Macros shared throughout the compiler-builtins implementation + +/// Changes the visibility to `pub` if feature "public-test-deps" is set +#[cfg(not(feature = "public-test-deps"))] +macro_rules! public_test_dep { + ($(#[$($meta:meta)*])* pub(crate) $ident:ident $($tokens:tt)*) => { + $(#[$($meta)*])* pub(crate) $ident $($tokens)* + }; +} + +/// Changes the visibility to `pub` if feature "public-test-deps" is set +#[cfg(feature = "public-test-deps")] +macro_rules! public_test_dep { + {$(#[$($meta:meta)*])* pub(crate) $ident:ident $($tokens:tt)*} => { + $(#[$($meta)*])* pub $ident $($tokens)* + }; +} + +/// The "main macro" used for defining intrinsics. +/// +/// The compiler-builtins library is super platform-specific with tons of crazy +/// little tweaks for various platforms. As a result it *could* involve a lot of +/// #[cfg] and macro soup, but the intention is that this macro alleviates a lot +/// of that complexity. Ideally this macro has all the weird ABI things +/// platforms need and elsewhere in this library it just looks like normal Rust +/// code. +/// +/// This macro is structured to be invoked with a bunch of functions that looks +/// like: +/// +/// intrinsics! { +/// pub extern "C" fn foo(a: i32) -> u32 { +/// // ... +/// } +/// +/// #[nonstandard_attribute] +/// pub extern "C" fn bar(a: i32) -> u32 { +/// // ... +/// } +/// } +/// +/// Each function is defined in a manner that looks like a normal Rust function. +/// The macro then accepts a few nonstandard attributes that can decorate +/// various functions. Each of the attributes is documented below with what it +/// can do, and each of them slightly tweaks how further expansion happens. +/// +/// A quick overview of attributes supported right now are: +/// +/// * `maybe_use_optimized_c_shim` - indicates that the Rust implementation is +/// ignored if an optimized C version was compiled. +/// * `aapcs_on_arm` - forces the ABI of the function to be `"aapcs"` on ARM and +/// the specified ABI everywhere else. +/// * `unadjusted_on_win64` - like `aapcs_on_arm` this switches to the +/// `"unadjusted"` abi on Win64 and the specified abi elsewhere. +/// * `win64_128bit_abi_hack` - this attribute is used for 128-bit integer +/// intrinsics where the ABI is slightly tweaked on Windows platforms, but +/// it's a normal ABI elsewhere for returning a 128 bit integer. +/// * `arm_aeabi_alias` - handles the "aliasing" of various intrinsics on ARM +/// their otherwise typical names to other prefixed ones. +/// +macro_rules! 
intrinsics { + () => (); + + // Support cfg_attr: + ( + #[cfg_attr($e:meta, $($attr:tt)*)] + $(#[$($attrs:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + $($rest:tt)* + ) => ( + #[cfg($e)] + intrinsics! { + #[$($attr)*] + $(#[$($attrs)*])* + pub extern $abi fn $name($($argname: $ty),*) $(-> $ret)? { + $($body)* + } + } + + #[cfg(not($e))] + intrinsics! { + $(#[$($attrs)*])* + pub extern $abi fn $name($($argname: $ty),*) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // Right now there's a bunch of architecture-optimized intrinsics in the + // stock compiler-rt implementation. Not all of these have been ported over + // to Rust yet so when the `c` feature of this crate is enabled we fall back + // to the architecture-specific versions which should be more optimized. The + // purpose of this macro is to easily allow specifying this. + // + // The `#[maybe_use_optimized_c_shim]` attribute indicates that this + // intrinsic may have an optimized C version. In these situations the build + // script, if the C code is enabled and compiled, will emit a cfg directive + // to get passed to rustc for our compilation. If that cfg is set we skip + // the Rust implementation, but if the attribute is not enabled then we + // compile in the Rust implementation. + ( + #[maybe_use_optimized_c_shim] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg($name = "optimized-c")] + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + extern $abi { + fn $name($($argname: $ty),*) $(-> $ret)?; + } + unsafe { + $name($($argname),*) + } + } + + #[cfg(not($name = "optimized-c"))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // We recognize the `#[aapcs_on_arm]` attribute here and generate the + // same intrinsic but force it to have the `"aapcs"` calling convention on + // ARM and `"C"` elsewhere. + ( + #[aapcs_on_arm] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(target_arch = "arm")] + intrinsics! { + $(#[$($attr)*])* + pub extern "aapcs" fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + #[cfg(not(target_arch = "arm"))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // Like aapcs above we recognize an attribute for the "unadjusted" abi on + // win64 for some methods. + ( + #[unadjusted_on_win64] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(all(windows, target_pointer_width = "64"))] + intrinsics! { + $(#[$($attr)*])* + pub extern "unadjusted" fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + #[cfg(not(all(windows, target_pointer_width = "64")))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // Some intrinsics on win64 which return a 128-bit integer have an.. unusual + // calling convention. That's managed here with this "abi hack" which alters + // the generated symbol's ABI. 
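+    // (Concretely, the exported symbol returns the `#[repr(simd)]` `U64x2` wrapper
+    // defined in the `win64_128bit_abi_hack` module at the bottom of this file.)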
+ // + // This will still define a function in this crate with the given name and + // signature, but the actual symbol for the intrinsic may have a slightly + // different ABI on win64. + ( + #[win64_128bit_abi_hack] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(all(windows, target_arch = "x86_64"))] + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + #[cfg(all(windows, target_arch = "x86_64"))] + pub mod $name { + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub extern $abi fn $name( $($argname: $ty),* ) + -> ::macros::win64_128bit_abi_hack::U64x2 + { + let e: $($ret)? = super::$name($($argname),*); + ::macros::win64_128bit_abi_hack::U64x2::from(e) + } + } + + #[cfg(not(all(windows, target_arch = "x86_64")))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // A bunch of intrinsics on ARM are aliased in the standard compiler-rt + // build under `__aeabi_*` aliases, and LLVM will call these instead of the + // original function. The aliasing here is used to generate these symbols in + // the object file. + ( + #[arm_aeabi_alias = $alias:ident] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(target_arch = "arm")] + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + #[cfg(target_arch = "arm")] + pub mod $name { + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + super::$name($($argname),*) + } + } + + #[cfg(target_arch = "arm")] + pub mod $alias { + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub extern "aapcs" fn $alias( $($argname: $ty),* ) $(-> $ret)? { + super::$name($($argname),*) + } + } + + #[cfg(not(target_arch = "arm"))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // C mem* functions are only generated when the "mem" feature is enabled. + ( + #[mem_builtin] + $(#[$($attr:tt)*])* + pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + $(#[$($attr)*])* + pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + #[cfg(feature = "mem")] + pub mod $name { + $(#[$($attr)*])* + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + super::$name($($argname),*) + } + } + + intrinsics!($($rest)*); + ); + + // Naked functions are special: we can't generate wrappers for them since + // they use a custom calling convention. + ( + #[naked] + $(#[$($attr:tt)*])* + pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + pub mod $name { + #[naked] + $(#[$($attr)*])* + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // For division and modulo, AVR uses a custom calling convention¹ that does + // not match our definitions here. 
Ideally we would just use hand-written + // naked functions, but that's quite a lot of code to port² - so for the + // time being we are just ignoring the problematic functions, letting + // avr-gcc (which is required to compile to AVR anyway) link them from + // libgcc. + // + // ¹ https://gcc.gnu.org/wiki/avr-gcc (see "Exceptions to the Calling + // Convention") + // ² https://github.com/gcc-mirror/gcc/blob/31048012db98f5ec9c2ba537bfd850374bdd771f/libgcc/config/avr/lib1funcs.S + ( + #[avr_skip] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(not(target_arch = "avr"))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + + // This is the final catch-all rule. At this point we generate an + // intrinsic with a conditional `#[no_mangle]` directive to avoid + // interfering with duplicate symbols and whatnot during testing. + // + // The implementation is placed in a separate module, to take advantage + // of the fact that rustc partitions functions into code generation + // units based on module they are defined in. As a result we will have + // a separate object file for each intrinsic. For further details see + // corresponding PR in rustc https://github.com/rust-lang/rust/pull/70846 + // + // After the intrinsic is defined we just continue with the rest of the + // input we were given. + ( + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + pub mod $name { + $(#[$($attr)*])* + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + super::$name($($argname),*) + } + } + + intrinsics!($($rest)*); + ); + + // Same as the above for unsafe functions. + ( + $(#[$($attr:tt)*])* + pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + $(#[$($attr)*])* + pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + pub mod $name { + $(#[$($attr)*])* + #[cfg_attr(not(feature = "mangled-names"), no_mangle)] + pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + super::$name($($argname),*) + } + } + + intrinsics!($($rest)*); + ); +} + +// Hack for LLVM expectations for ABI on windows. This is used by the +// `#[win64_128bit_abi_hack]` attribute recognized above +#[cfg(all(windows, target_pointer_width = "64"))] +pub mod win64_128bit_abi_hack { + #[repr(simd)] + pub struct U64x2(u64, u64); + + impl From for U64x2 { + fn from(i: i128) -> U64x2 { + use int::DInt; + let j = i as u128; + U64x2(j.lo(), j.hi()) + } + } + + impl From for U64x2 { + fn from(i: u128) -> U64x2 { + use int::DInt; + U64x2(i.lo(), i.hi()) + } + } +} diff --git a/vendor/compiler_builtins/src/math.rs b/vendor/compiler_builtins/src/math.rs new file mode 100644 index 000000000..fa59753f8 --- /dev/null +++ b/vendor/compiler_builtins/src/math.rs @@ -0,0 +1,117 @@ +#[allow(dead_code)] +#[path = "../libm/src/math/mod.rs"] +mod libm; + +macro_rules! no_mangle { + ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty;)+) => { + intrinsics! 
{ + $( + pub extern "C" fn $fun($($iid: $ity),+) -> $oty { + self::libm::$fun($($iid),+) + } + )+ + } + } +} + +#[cfg(any( + all( + target_family = "wasm", + target_os = "unknown", + not(target_env = "wasi") + ), + all(target_arch = "x86_64", target_os = "uefi"), + all(target_arch = "xtensa", target_os = "none"), + all(target_vendor = "fortanix", target_env = "sgx") +))] +no_mangle! { + fn acos(x: f64) -> f64; + fn asin(x: f64) -> f64; + fn cbrt(x: f64) -> f64; + fn expm1(x: f64) -> f64; + fn hypot(x: f64, y: f64) -> f64; + fn tan(x: f64) -> f64; + fn cos(x: f64) -> f64; + fn expf(x: f32) -> f32; + fn log2(x: f64) -> f64; + fn log2f(x: f32) -> f32; + fn log10(x: f64) -> f64; + fn log10f(x: f32) -> f32; + fn log(x: f64) -> f64; + fn logf(x: f32) -> f32; + fn fmin(x: f64, y: f64) -> f64; + fn fminf(x: f32, y: f32) -> f32; + fn fmax(x: f64, y: f64) -> f64; + fn fmaxf(x: f32, y: f32) -> f32; + fn round(x: f64) -> f64; + fn roundf(x: f32) -> f32; + fn sin(x: f64) -> f64; + fn pow(x: f64, y: f64) -> f64; + fn powf(x: f32, y: f32) -> f32; + fn fmod(x: f64, y: f64) -> f64; + fn fmodf(x: f32, y: f32) -> f32; + fn acosf(n: f32) -> f32; + fn atan2f(a: f32, b: f32) -> f32; + fn atanf(n: f32) -> f32; + fn coshf(n: f32) -> f32; + fn expm1f(n: f32) -> f32; + fn fdim(a: f64, b: f64) -> f64; + fn fdimf(a: f32, b: f32) -> f32; + fn log1pf(n: f32) -> f32; + fn sinhf(n: f32) -> f32; + fn tanhf(n: f32) -> f32; + fn ldexp(f: f64, n: i32) -> f64; + fn ldexpf(f: f32, n: i32) -> f32; +} + +#[cfg(any( + all( + target_family = "wasm", + target_os = "unknown", + not(target_env = "wasi") + ), + all(target_arch = "xtensa", target_os = "none"), + all(target_vendor = "fortanix", target_env = "sgx") +))] +no_mangle! { + fn atan(x: f64) -> f64; + fn atan2(x: f64, y: f64) -> f64; + fn cosh(x: f64) -> f64; + fn log1p(x: f64) -> f64; + fn sinh(x: f64) -> f64; + fn tanh(x: f64) -> f64; + fn cosf(x: f32) -> f32; + fn exp(x: f64) -> f64; + fn sinf(x: f32) -> f32; + fn exp2(x: f64) -> f64; + fn exp2f(x: f32) -> f32; + fn fma(x: f64, y: f64, z: f64) -> f64; + fn fmaf(x: f32, y: f32, z: f32) -> f32; + fn asinf(n: f32) -> f32; + fn cbrtf(n: f32) -> f32; + fn hypotf(x: f32, y: f32) -> f32; + fn tanf(n: f32) -> f32; +} + +#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] +no_mangle! { + fn ceil(x: f64) -> f64; + fn ceilf(x: f32) -> f32; + fn floor(x: f64) -> f64; + fn floorf(x: f32) -> f32; + fn trunc(x: f64) -> f64; + fn truncf(x: f32) -> f32; +} + +// only for the thumb*-none-eabi* targets +#[cfg(all(target_arch = "arm", target_os = "none"))] +no_mangle! { + fn fmin(x: f64, y: f64) -> f64; + fn fminf(x: f32, y: f32) -> f32; + fn fmax(x: f64, y: f64) -> f64; + fn fmaxf(x: f32, y: f32) -> f32; + // `f64 % f64` + fn fmod(x: f64, y: f64) -> f64; + // `f32 % f32` + fn fmodf(x: f32, y: f32) -> f32; +} diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs new file mode 100644 index 000000000..815132425 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/impls.rs @@ -0,0 +1,267 @@ +use core::intrinsics::likely; + +const WORD_SIZE: usize = core::mem::size_of::(); +const WORD_MASK: usize = WORD_SIZE - 1; + +// If the number of bytes involved exceed this threshold we will opt in word-wise copy. +// The value here selected is max(2 * WORD_SIZE, 16): +// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through +// word-wise copy. +// * The word-wise copy logic needs to perform some checks so it has some small overhead. 
+// ensures that even on 32-bit platforms we have copied at least 8 bytes through +// word-wise copy so the saving of word-wise copy outweights the fixed overhead. +const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 { + 2 * WORD_SIZE +} else { + 16 +}; + +#[cfg(feature = "mem-unaligned")] +unsafe fn read_usize_unaligned(x: *const usize) -> usize { + // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which + // is translated to memcpy in LLVM. + let x_read = (x as *const [u8; core::mem::size_of::()]).read(); + core::mem::transmute(x_read) +} + +#[inline(always)] +pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) { + #[inline(always)] + unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) { + let dest_end = dest.add(n); + while dest < dest_end { + *dest = *src; + dest = dest.add(1); + src = src.add(1); + } + } + + #[inline(always)] + unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_end = dest.add(n) as *mut usize; + + while dest_usize < dest_end { + *dest_usize = *src_usize; + dest_usize = dest_usize.add(1); + src_usize = src_usize.add(1); + } + } + + #[cfg(not(feature = "mem-unaligned"))] + #[inline(always)] + unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let dest_end = dest.add(n) as *mut usize; + + // Calculate the misalignment offset and shift needed to reassemble value. + let offset = src as usize & WORD_MASK; + let shift = offset * 8; + + // Realign src + let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize; + // This will read (but won't use) bytes out of bound. + // cfg needed because not all targets will have atomic loads that can be lowered + // (e.g. BPF, MSP430), or provided by an external library (e.g. 
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_usize < dest_end { + src_aligned = src_aligned.add(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift); + #[cfg(target_endian = "big")] + let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift); + prev_word = cur_word; + + *dest_usize = resembled; + dest_usize = dest_usize.add(1); + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_end = dest.add(n) as *mut usize; + + while dest_usize < dest_end { + *dest_usize = read_usize_unaligned(src_usize); + dest_usize = dest_usize.add(1); + src_usize = src_usize.add(1); + } + } + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK; + copy_forward_bytes(dest, src, dest_misalignment); + dest = dest.add(dest_misalignment); + src = src.add(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_forward_aligned_words(dest, src, n_words); + } else { + copy_forward_misaligned_words(dest, src, n_words); + } + dest = dest.add(n_words); + src = src.add(n_words); + n -= n_words; + } + copy_forward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) { + // The following backward copy helper functions uses the pointers past the end + // as their inputs instead of pointers to the start! + #[inline(always)] + unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) { + let dest_start = dest.sub(n); + while dest_start < dest { + dest = dest.sub(1); + src = src.sub(1); + *dest = *src; + } + } + + #[inline(always)] + unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = *src_usize; + } + } + + #[cfg(not(feature = "mem-unaligned"))] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + // Calculate the misalignment offset and shift needed to reassemble value. + let offset = src as usize & WORD_MASK; + let shift = offset * 8; + + // Realign src_aligned + let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize; + // This will read (but won't use) bytes out of bound. + // cfg needed because not all targets will have atomic loads that can be lowered + // (e.g. BPF, MSP430), or provided by an external library (e.g. 
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_start < dest_usize { + src_aligned = src_aligned.sub(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift; + #[cfg(target_endian = "big")] + let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift; + prev_word = cur_word; + + dest_usize = dest_usize.sub(1); + *dest_usize = resembled; + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = read_usize_unaligned(src_usize); + } + } + + let mut dest = dest.add(n); + let mut src = src.add(n); + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = dest as usize & WORD_MASK; + copy_backward_bytes(dest, src, dest_misalignment); + dest = dest.sub(dest_misalignment); + src = src.sub(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_backward_aligned_words(dest, src, n_words); + } else { + copy_backward_misaligned_words(dest, src, n_words); + } + dest = dest.sub(n_words); + src = src.sub(n_words); + n -= n_words; + } + copy_backward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { + #[inline(always)] + pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) { + let end = s.add(n); + while s < end { + *s = c; + s = s.add(1); + } + } + + #[inline(always)] + pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) { + let mut broadcast = c as usize; + let mut bits = 8; + while bits < WORD_SIZE * 8 { + broadcast |= broadcast << bits; + bits *= 2; + } + + let mut s_usize = s as *mut usize; + let end = s.add(n) as *mut usize; + + while s_usize < end { + *s_usize = broadcast; + s_usize = s_usize.add(1); + } + } + + if likely(n >= WORD_COPY_THRESHOLD) { + // Align s + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let misalignment = (s as usize).wrapping_neg() & WORD_MASK; + set_bytes_bytes(s, c, misalignment); + s = s.add(misalignment); + n -= misalignment; + + let n_words = n & !WORD_MASK; + set_bytes_words(s, c, n_words); + s = s.add(n_words); + n -= n_words; + } + set_bytes_bytes(s, c, n); +} diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs new file mode 100644 index 000000000..a55113861 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/mod.rs @@ -0,0 +1,211 @@ +// Trying to satisfy clippy here is hopeless +#![allow(clippy::style)] + +#[allow(warnings)] +#[cfg(target_pointer_width = "16")] +type c_int = i16; +#[allow(warnings)] +#[cfg(not(target_pointer_width = "16"))] +type c_int = i32; + +use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}; +use core::mem; +use core::ops::{BitOr, Shl}; + +// memcpy/memmove/memset have optimized implementations on some architectures +#[cfg_attr( + all(not(feature = 
"no-asm"), target_arch = "x86_64"), + path = "x86_64.rs" +)] +mod impls; + +intrinsics! { + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + impls::copy_forward(dest, src, n); + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + let delta = (dest as usize).wrapping_sub(src as usize); + if delta >= n { + // We can copy forwards because either dest is far enough ahead of src, + // or src is ahead of dest (and delta overflowed). + impls::copy_forward(dest, src, n); + } else { + impls::copy_backward(dest, src, n); + } + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 { + impls::set_bytes(s, c as u8, n); + s + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + memcmp(s1, s2, n) + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize { + let mut n = 0; + let mut s = s; + while *s != 0 { + n += 1; + s = s.offset(1); + } + n + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memcpy_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memmove_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + if src < dest as *const T { + // copy from end + let mut i = n; + while i != 0 { + i -= 1; + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + } + } else { + // copy from beginning + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } + } +} + +// `T` must be a primitive integer type, and `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memset_element_unordered_atomic(s: *mut T, c: u8, bytes: usize) +where + T: Copy + From + Shl + BitOr, +{ + unsafe { + let n = exact_div(bytes, mem::size_of::()); + + // Construct a value of type `T` consisting of repeated `c` + // bytes, to let us ensure we write each `T` atomically. 
+ let mut x = T::from(c); + let mut i = 1; + while i < mem::size_of::() { + x = x << 8 | T::from(c); + i += 1; + } + + // Write it to `s` + let mut i = 0; + while i < n { + atomic_store_unordered(s.add(i), x); + i += 1; + } + } +} + +intrinsics! { + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_1(s: *mut u8, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_2(s: *mut u16, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_4(s: *mut u32, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_8(s: *mut u64, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_16(s: *mut u128, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, 
bytes); + } +} diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs new file mode 100644 index 000000000..a7ab6f605 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/x86_64.rs @@ -0,0 +1,100 @@ +// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have +// been enhanced to perform better than an simple qword loop, making them ideal +// for implementing memcpy/memset. Note that "rep cmps" has received no such +// enhancement, so it is not used to implement memcmp. +// +// On certain recent Intel processors, "rep movsb" and "rep stosb" have been +// further enhanced to automatically select the best microarchitectural +// implementation based on length and alignment. See the following features from +// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual": +// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) +// - FSRM - Fast Short REP MOV (Ice Lake and later) +// - Fast Zero-Length MOVSB (On no current hardware) +// - Fast Short STOSB (On no current hardware) +// +// To simplify things, we switch to using the byte-based variants if the "ermsb" +// feature is present at compile-time. We don't bother detecting other features. +// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsb (%rsi), (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsq (%rsi), (%rdi)", + "mov {byte_count:e}, %ecx", + "repe movsb (%rsi), (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "std", + "repe movsq (%rsi), (%rdi)", + "movl {byte_count:e}, %ecx", + "addq $7, %rdi", + "addq $7, %rsi", + "repe movsb (%rsi), (%rdi)", + "cld", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest.add(count).wrapping_sub(8) => _, + inout("rsi") src.add(count).wrapping_sub(8) => _, + options(att_syntax, nostack) + ); +} + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe stosb %al, (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("al") c => _, + options(att_syntax, nostack, preserves_flags) + ) +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
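+    // `rax` is bound to `c` broadcast into all eight bytes (`c * 0x0101_0101_0101_0101`);
+    // the bulk is stored as qwords, then the remaining `byte_count` (0..=7) bytes one at a time.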
+ core::arch::asm!( + "repe stosq %rax, (%rdi)", + "mov {byte_count:e}, %ecx", + "repe stosb %al, (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + in("rax") (c as u64) * 0x0101010101010101, + options(att_syntax, nostack, preserves_flags) + ); +} diff --git a/vendor/compiler_builtins/src/probestack.rs b/vendor/compiler_builtins/src/probestack.rs new file mode 100644 index 000000000..0c30384db --- /dev/null +++ b/vendor/compiler_builtins/src/probestack.rs @@ -0,0 +1,350 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This module defines the `__rust_probestack` intrinsic which is used in the +//! implementation of "stack probes" on certain platforms. +//! +//! The purpose of a stack probe is to provide a static guarantee that if a +//! thread has a guard page then a stack overflow is guaranteed to hit that +//! guard page. If a function did not have a stack probe then there's a risk of +//! having a stack frame *larger* than the guard page, so a function call could +//! skip over the guard page entirely and then later hit maybe the heap or +//! another thread, possibly leading to security vulnerabilities such as [The +//! Stack Clash], for example. +//! +//! [The Stack Clash]: https://blog.qualys.com/securitylabs/2017/06/19/the-stack-clash +//! +//! The `__rust_probestack` is called in the prologue of functions whose stack +//! size is larger than the guard page, for example larger than 4096 bytes on +//! x86. This function is then responsible for "touching" all pages relevant to +//! the stack to ensure that that if any of them are the guard page we'll hit +//! them guaranteed. +//! +//! The precise ABI for how this function operates is defined by LLVM. There's +//! no real documentation as to what this is, so you'd basically need to read +//! the LLVM source code for reference. Often though the test cases can be +//! illuminating as to the ABI that's generated, or just looking at the output +//! of `llc`. +//! +//! Note that `#[naked]` is typically used here for the stack probe because the +//! ABI corresponds to no actual ABI. +//! +//! Finally it's worth noting that at the time of this writing LLVM only has +//! support for stack probes on x86 and x86_64. There's no support for stack +//! probes on any other architecture like ARM or PowerPC64. LLVM I'm sure would +//! be more than welcome to accept such a change! + +#![cfg(not(feature = "mangled-names"))] +// Windows already has builtins to do this. +#![cfg(not(windows))] +// All these builtins require assembly +#![cfg(not(feature = "no-asm"))] +// We only define stack probing for these architectures today. +#![cfg(any(target_arch = "x86_64", target_arch = "x86"))] + +extern "C" { + pub fn __rust_probestack(); +} + +// A wrapper for our implementation of __rust_probestack, which allows us to +// keep the assembly inline while controlling all CFI directives in the assembly +// emitted for the function. +// +// This is the ELF version. +#[cfg(not(any(target_vendor = "apple", target_os = "uefi")))] +macro_rules! 
define_rust_probestack { + ($body: expr) => { + concat!( + " + .pushsection .text.__rust_probestack + .globl __rust_probestack + .type __rust_probestack, @function + .hidden __rust_probestack + __rust_probestack: + ", + $body, + " + .size __rust_probestack, . - __rust_probestack + .popsection + " + ) + }; +} + +#[cfg(all(target_os = "uefi", target_arch = "x86_64"))] +macro_rules! define_rust_probestack { + ($body: expr) => { + concat!( + " + .globl __rust_probestack + __rust_probestack: + ", + $body + ) + }; +} + +// Same as above, but for Mach-O. Note that the triple underscore +// is deliberate +#[cfg(target_vendor = "apple")] +macro_rules! define_rust_probestack { + ($body: expr) => { + concat!( + " + .globl ___rust_probestack + ___rust_probestack: + ", + $body + ) + }; +} + +// In UEFI x86 arch, triple underscore is deliberate. +#[cfg(all(target_os = "uefi", target_arch = "x86"))] +macro_rules! define_rust_probestack { + ($body: expr) => { + concat!( + " + .globl ___rust_probestack + ___rust_probestack: + ", + $body + ) + }; +} + +// Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax, +// ensuring that if any pages are unmapped we'll make a page fault. +// +// The ABI here is that the stack frame size is located in `%rax`. Upon +// return we're not supposed to modify `%rsp` or `%rax`. +// +// Any changes to this function should be replicated to the SGX version below. +#[cfg(all( + target_arch = "x86_64", + not(all(target_env = "sgx", target_vendor = "fortanix")) +))] +core::arch::global_asm!( + define_rust_probestack!( + " + .cfi_startproc + pushq %rbp + .cfi_adjust_cfa_offset 8 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + + mov %rax,%r11 // duplicate %rax as we're clobbering %r11 + + // Main loop, taken in one page increments. We're decrementing rsp by + // a page each time until there's less than a page remaining. We're + // guaranteed that this function isn't called unless there's more than a + // page needed. + // + // Note that we're also testing against `8(%rsp)` to account for the 8 + // bytes pushed on the stack orginally with our return address. Using + // `8(%rsp)` simulates us testing the stack pointer in the caller's + // context. + + // It's usually called when %rax >= 0x1000, but that's not always true. + // Dynamic stack allocation, which is needed to implement unsized + // rvalues, triggers stackprobe even if %rax < 0x1000. + // Thus we have to check %r11 first to avoid segfault. + cmp $0x1000,%r11 + jna 3f +2: + sub $0x1000,%rsp + test %rsp,8(%rsp) + sub $0x1000,%r11 + cmp $0x1000,%r11 + ja 2b + +3: + // Finish up the last remaining stack space requested, getting the last + // bits out of r11 + sub %r11,%rsp + test %rsp,8(%rsp) + + // Restore the stack pointer to what it previously was when entering + // this function. The caller will readjust the stack pointer after we + // return. + add %rax,%rsp + + leave + .cfi_def_cfa_register %rsp + .cfi_adjust_cfa_offset -8 + ret + .cfi_endproc + " + ), + options(att_syntax) +); + +// This function is the same as above, except that some instructions are +// [manually patched for LVI]. 
+// +// [manually patched for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions +#[cfg(all( + target_arch = "x86_64", + all(target_env = "sgx", target_vendor = "fortanix") +))] +core::arch::global_asm!( + define_rust_probestack!( + " + .cfi_startproc + pushq %rbp + .cfi_adjust_cfa_offset 8 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + + mov %rax,%r11 // duplicate %rax as we're clobbering %r11 + + // Main loop, taken in one page increments. We're decrementing rsp by + // a page each time until there's less than a page remaining. We're + // guaranteed that this function isn't called unless there's more than a + // page needed. + // + // Note that we're also testing against `8(%rsp)` to account for the 8 + // bytes pushed on the stack orginally with our return address. Using + // `8(%rsp)` simulates us testing the stack pointer in the caller's + // context. + + // It's usually called when %rax >= 0x1000, but that's not always true. + // Dynamic stack allocation, which is needed to implement unsized + // rvalues, triggers stackprobe even if %rax < 0x1000. + // Thus we have to check %r11 first to avoid segfault. + cmp $0x1000,%r11 + jna 3f +2: + sub $0x1000,%rsp + test %rsp,8(%rsp) + sub $0x1000,%r11 + cmp $0x1000,%r11 + ja 2b + +3: + // Finish up the last remaining stack space requested, getting the last + // bits out of r11 + sub %r11,%rsp + test %rsp,8(%rsp) + + // Restore the stack pointer to what it previously was when entering + // this function. The caller will readjust the stack pointer after we + // return. + add %rax,%rsp + + leave + .cfi_def_cfa_register %rsp + .cfi_adjust_cfa_offset -8 + pop %r11 + lfence + jmp *%r11 + .cfi_endproc + " + ), + options(att_syntax) +); + +#[cfg(all(target_arch = "x86", not(target_os = "uefi")))] +// This is the same as x86_64 above, only translated for 32-bit sizes. Note +// that on Unix we're expected to restore everything as it was, this +// function basically can't tamper with anything. +// +// The ABI here is the same as x86_64, except everything is 32-bits large. +core::arch::global_asm!( + define_rust_probestack!( + " + .cfi_startproc + push %ebp + .cfi_adjust_cfa_offset 4 + .cfi_offset %ebp, -8 + mov %esp, %ebp + .cfi_def_cfa_register %ebp + push %ecx + mov %eax,%ecx + + cmp $0x1000,%ecx + jna 3f +2: + sub $0x1000,%esp + test %esp,8(%esp) + sub $0x1000,%ecx + cmp $0x1000,%ecx + ja 2b + +3: + sub %ecx,%esp + test %esp,8(%esp) + + add %eax,%esp + pop %ecx + leave + .cfi_def_cfa_register %esp + .cfi_adjust_cfa_offset -4 + ret + .cfi_endproc + " + ), + options(att_syntax) +); + +#[cfg(all(target_arch = "x86", target_os = "uefi"))] +// UEFI target is windows like target. LLVM will do _chkstk things like windows. +// probestack function will also do things like _chkstk in MSVC. +// So we need to sub %ax %sp in probestack when arch is x86. +// +// REF: Rust commit(74e80468347) +// rust\src\llvm-project\llvm\lib\Target\X86\X86FrameLowering.cpp: 805 +// Comments in LLVM: +// MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. +// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp +// themselves. 
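+// Hence the 32-bit asm below ends with an extra `sub %eax, %esp` before `ret`,
+// unlike the non-UEFI x86 version above.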
+core::arch::global_asm!( + define_rust_probestack!( + " + .cfi_startproc + push %ebp + .cfi_adjust_cfa_offset 4 + .cfi_offset %ebp, -8 + mov %esp, %ebp + .cfi_def_cfa_register %ebp + push %ecx + push %edx + mov %eax,%ecx + + cmp $0x1000,%ecx + jna 3f +2: + sub $0x1000,%esp + test %esp,8(%esp) + sub $0x1000,%ecx + cmp $0x1000,%ecx + ja 2b + +3: + sub %ecx,%esp + test %esp,8(%esp) + mov 4(%ebp),%edx + mov %edx, 12(%esp) + add %eax,%esp + pop %edx + pop %ecx + leave + + sub %eax, %esp + .cfi_def_cfa_register %esp + .cfi_adjust_cfa_offset -4 + ret + .cfi_endproc + " + ), + options(att_syntax) +); diff --git a/vendor/compiler_builtins/src/riscv.rs b/vendor/compiler_builtins/src/riscv.rs new file mode 100644 index 000000000..ee78b9dba --- /dev/null +++ b/vendor/compiler_builtins/src/riscv.rs @@ -0,0 +1,34 @@ +intrinsics! { + // Implementation from gcc + // https://raw.githubusercontent.com/gcc-mirror/gcc/master/libgcc/config/epiphany/mulsi3.c + pub extern "C" fn __mulsi3(a: u32, b: u32) -> u32 { + let (mut a, mut b) = (a, b); + let mut r = 0; + + while a > 0 { + if a & 1 > 0 { + r += b; + } + a >>= 1; + b <<= 1; + } + + r + } + + #[cfg(not(target_feature = "m"))] + pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 { + let (mut a, mut b) = (a, b); + let mut r = 0; + + while a > 0 { + if a & 1 > 0 { + r += b; + } + a >>= 1; + b <<= 1; + } + + r + } +} diff --git a/vendor/compiler_builtins/src/x86.rs b/vendor/compiler_builtins/src/x86.rs new file mode 100644 index 000000000..fd1f32e3a --- /dev/null +++ b/vendor/compiler_builtins/src/x86.rs @@ -0,0 +1,85 @@ +#![allow(unused_imports)] + +use core::intrinsics; + +// NOTE These functions are implemented using assembly because they using a custom +// calling convention which can't be implemented using a normal Rust function + +// NOTE These functions are never mangled as they are not tested against compiler-rt +// and mangling ___chkstk would break the `jmp ___chkstk` instruction in __alloca + +intrinsics! 
{ + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn ___chkstk_ms() { + core::arch::asm!( + "push %ecx", + "push %eax", + "cmp $0x1000,%eax", + "lea 12(%esp),%ecx", + "jb 1f", + "2:", + "sub $0x1000,%ecx", + "test %ecx,(%ecx)", + "sub $0x1000,%eax", + "cmp $0x1000,%eax", + "ja 2b", + "1:", + "sub %eax,%ecx", + "test %ecx,(%ecx)", + "pop %eax", + "pop %ecx", + "ret", + options(noreturn, att_syntax) + ); + } + + // FIXME: __alloca should be an alias to __chkstk + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn __alloca() { + core::arch::asm!( + "jmp ___chkstk", // Jump to ___chkstk since fallthrough may be unreliable" + options(noreturn, att_syntax) + ); + } + + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn ___chkstk() { + core::arch::asm!( + "push %ecx", + "cmp $0x1000,%eax", + "lea 8(%esp),%ecx", // esp before calling this routine -> ecx + "jb 1f", + "2:", + "sub $0x1000,%ecx", + "test %ecx,(%ecx)", + "sub $0x1000,%eax", + "cmp $0x1000,%eax", + "ja 2b", + "1:", + "sub %eax,%ecx", + "test %ecx,(%ecx)", + "lea 4(%esp),%eax", // load pointer to the return address into eax + "mov %ecx,%esp", // install the new top of stack pointer into esp + "mov -4(%eax),%ecx", // restore ecx + "push (%eax)", // push return address onto the stack + "sub %esp,%eax", // restore the original value in eax + "ret", + options(noreturn, att_syntax) + ); + } +} diff --git a/vendor/compiler_builtins/src/x86_64.rs b/vendor/compiler_builtins/src/x86_64.rs new file mode 100644 index 000000000..393eeddd8 --- /dev/null +++ b/vendor/compiler_builtins/src/x86_64.rs @@ -0,0 +1,94 @@ +#![allow(unused_imports)] + +use core::intrinsics; + +// NOTE These functions are implemented using assembly because they using a custom +// calling convention which can't be implemented using a normal Rust function + +// NOTE These functions are never mangled as they are not tested against compiler-rt +// and mangling ___chkstk would break the `jmp ___chkstk` instruction in __alloca + +intrinsics! 
{ + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn ___chkstk_ms() { + core::arch::asm!( + "push %rcx", + "push %rax", + "cmp $0x1000,%rax", + "lea 24(%rsp),%rcx", + "jb 1f", + "2:", + "sub $0x1000,%rcx", + "test %rcx,(%rcx)", + "sub $0x1000,%rax", + "cmp $0x1000,%rax", + "ja 2b", + "1:", + "sub %rax,%rcx", + "test %rcx,(%rcx)", + "pop %rax", + "pop %rcx", + "ret", + options(noreturn, att_syntax) + ); + } + + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn __alloca() { + core::arch::asm!( + "mov %rcx,%rax", // x64 _alloca is a normal function with parameter in rcx + "jmp ___chkstk", // Jump to ___chkstk since fallthrough may be unreliable" + options(noreturn, att_syntax) + ); + } + + #[naked] + #[cfg(all( + windows, + target_env = "gnu", + not(feature = "no-asm") + ))] + pub unsafe extern "C" fn ___chkstk() { + core::arch::asm!( + "push %rcx", + "cmp $0x1000,%rax", + "lea 16(%rsp),%rcx", // rsp before calling this routine -> rcx + "jb 1f", + "2:", + "sub $0x1000,%rcx", + "test %rcx,(%rcx)", + "sub $0x1000,%rax", + "cmp $0x1000,%rax", + "ja 2b", + "1:", + "sub %rax,%rcx", + "test %rcx,(%rcx)", + "lea 8(%rsp),%rax", // load pointer to the return address into rax + "mov %rcx,%rsp", // install the new top of stack pointer into rsp + "mov -8(%rax),%rcx", // restore rcx + "push (%rax)", // push return address onto the stack + "sub %rsp,%rax", // restore the original value in rax + "ret", + options(noreturn, att_syntax) + ); + } +} + +// HACK(https://github.com/rust-lang/rust/issues/62785): x86_64-unknown-uefi needs special LLVM +// support unless we emit the _fltused +mod _fltused { + #[no_mangle] + #[used] + #[cfg(target_os = "uefi")] + static _fltused: i32 = 0; +} -- cgit v1.2.3