Diffstat (limited to 'vendor/compiler_builtins/src')
-rw-r--r-- | vendor/compiler_builtins/src/arm.rs        |   4
-rw-r--r-- | vendor/compiler_builtins/src/arm_linux.rs  | 110
-rw-r--r-- | vendor/compiler_builtins/src/float/conv.rs |   8
-rw-r--r-- | vendor/compiler_builtins/src/lib.rs        |   2
-rw-r--r-- | vendor/compiler_builtins/src/macros.rs     |  12
-rw-r--r-- | vendor/compiler_builtins/src/math.rs       |  17
-rw-r--r-- | vendor/compiler_builtins/src/mem/impls.rs  |  14
-rw-r--r-- | vendor/compiler_builtins/src/mem/mod.rs    |  11
-rw-r--r-- | vendor/compiler_builtins/src/mem/x86_64.rs | 150
9 files changed, 237 insertions, 91 deletions
diff --git a/vendor/compiler_builtins/src/arm.rs b/vendor/compiler_builtins/src/arm.rs index 9c1b6ad12..e517a9ef3 100644 --- a/vendor/compiler_builtins/src/arm.rs +++ b/vendor/compiler_builtins/src/arm.rs @@ -22,6 +22,7 @@ intrinsics! { // custom calling convention which can't be implemented using a normal Rust function. #[naked] #[cfg(not(target_env = "msvc"))] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_uidivmod() { core::arch::asm!( "push {{lr}}", @@ -36,6 +37,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_uldivmod() { core::arch::asm!( "push {{r4, lr}}", @@ -52,6 +54,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_idivmod() { core::arch::asm!( "push {{r0, r1, r4, lr}}", @@ -65,6 +68,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_ldivmod() { core::arch::asm!( "push {{r4, lr}}", diff --git a/vendor/compiler_builtins/src/arm_linux.rs b/vendor/compiler_builtins/src/arm_linux.rs index 8fe09485b..8f22eb628 100644 --- a/vendor/compiler_builtins/src/arm_linux.rs +++ b/vendor/compiler_builtins/src/arm_linux.rs @@ -55,7 +55,7 @@ fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 { } // Generic atomic read-modify-write operation -unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 { +unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T, f: F, g: G) -> u32 { let aligned_ptr = align_ptr(ptr); let (shift, mask) = get_shift_mask(ptr); @@ -65,7 +65,7 @@ unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 { let newval = f(curval); let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask); if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) { - return curval; + return g(curval, newval); } } } @@ -89,13 +89,21 @@ unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 { } macro_rules! atomic_rmw { - ($name:ident, $ty:ty, $op:expr) => { + ($name:ident, $ty:ty, $op:expr, $fetch:expr) => { intrinsics! { pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty { - atomic_rmw(ptr, |x| $op(x as $ty, val) as u32) as $ty + atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty } } }; + + (@old $name:ident, $ty:ty, $op:expr) => { + atomic_rmw!($name, $ty, $op, |old, _| old); + }; + + (@new $name:ident, $ty:ty, $op:expr) => { + atomic_rmw!($name, $ty, $op, |_, new| new); + }; } macro_rules! atomic_cmpxchg { ($name:ident, $ty:ty) => { @@ -107,101 +115,129 @@ macro_rules! 
atomic_cmpxchg { }; } -atomic_rmw!(__sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_add_2, u16, |a: u16, b: u16| a +atomic_rmw!(@old __sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b)); +atomic_rmw!(@old __sync_fetch_and_add_2, u16, |a: u16, b: u16| a + .wrapping_add(b)); +atomic_rmw!(@old __sync_fetch_and_add_4, u32, |a: u32, b: u32| a + .wrapping_add(b)); + +atomic_rmw!(@new __sync_add_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_add(b)); +atomic_rmw!(@new __sync_add_and_fetch_2, u16, |a: u16, b: u16| a .wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_add_4, u32, |a: u32, b: u32| a +atomic_rmw!(@new __sync_add_and_fetch_4, u32, |a: u32, b: u32| a .wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_sub_2, u16, |a: u16, b: u16| a +atomic_rmw!(@old __sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); +atomic_rmw!(@old __sync_fetch_and_sub_2, u16, |a: u16, b: u16| a .wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_sub_4, u32, |a: u32, b: u32| a +atomic_rmw!(@old __sync_fetch_and_sub_4, u32, |a: u32, b: u32| a .wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b); -atomic_rmw!(__sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b); -atomic_rmw!(__sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b); +atomic_rmw!(@new __sync_sub_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); +atomic_rmw!(@new __sync_sub_and_fetch_2, u16, |a: u16, b: u16| a + .wrapping_sub(b)); +atomic_rmw!(@new __sync_sub_and_fetch_4, u32, |a: u32, b: u32| a + .wrapping_sub(b)); + +atomic_rmw!(@old __sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b); +atomic_rmw!(@old __sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b); +atomic_rmw!(@old __sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b); + +atomic_rmw!(@new __sync_and_and_fetch_1, u8, |a: u8, b: u8| a & b); +atomic_rmw!(@new __sync_and_and_fetch_2, u16, |a: u16, b: u16| a & b); +atomic_rmw!(@new __sync_and_and_fetch_4, u32, |a: u32, b: u32| a & b); + +atomic_rmw!(@old __sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b); +atomic_rmw!(@old __sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b); +atomic_rmw!(@old __sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b); + +atomic_rmw!(@new __sync_or_and_fetch_1, u8, |a: u8, b: u8| a | b); +atomic_rmw!(@new __sync_or_and_fetch_2, u16, |a: u16, b: u16| a | b); +atomic_rmw!(@new __sync_or_and_fetch_4, u32, |a: u32, b: u32| a | b); + +atomic_rmw!(@old __sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b); +atomic_rmw!(@old __sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b); +atomic_rmw!(@old __sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b); -atomic_rmw!(__sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b); -atomic_rmw!(__sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b); -atomic_rmw!(__sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b); +atomic_rmw!(@new __sync_xor_and_fetch_1, u8, |a: u8, b: u8| a ^ b); +atomic_rmw!(@new __sync_xor_and_fetch_2, u16, |a: u16, b: u16| a ^ b); +atomic_rmw!(@new __sync_xor_and_fetch_4, u32, |a: u32, b: u32| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b); +atomic_rmw!(@old __sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b)); +atomic_rmw!(@old __sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b)); +atomic_rmw!(@old __sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b)); 
-atomic_rmw!(__sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b)); -atomic_rmw!(__sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b)); -atomic_rmw!(__sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_1, u8, |a: u8, b: u8| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_2, u16, |a: u16, b: u16| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_4, u32, |a: u32, b: u32| !(a & b)); -atomic_rmw!(__sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b { a } else { b }); -atomic_rmw!(__sync_lock_test_and_set_1, u8, |_: u8, b: u8| b); -atomic_rmw!(__sync_lock_test_and_set_2, u16, |_: u16, b: u16| b); -atomic_rmw!(__sync_lock_test_and_set_4, u32, |_: u32, b: u32| b); +atomic_rmw!(@old __sync_lock_test_and_set_1, u8, |_: u8, b: u8| b); +atomic_rmw!(@old __sync_lock_test_and_set_2, u16, |_: u16, b: u16| b); +atomic_rmw!(@old __sync_lock_test_and_set_4, u32, |_: u32, b: u32| b); atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8); atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16); diff --git a/vendor/compiler_builtins/src/float/conv.rs b/vendor/compiler_builtins/src/float/conv.rs index 07b58f3d2..68ba63408 100644 --- a/vendor/compiler_builtins/src/float/conv.rs +++ b/vendor/compiler_builtins/src/float/conv.rs @@ -92,12 +92,12 @@ intrinsics! 
{ f64::from_bits(int_to_float::u64_to_f64_bits(i)) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floatuntisf(i: u128) -> f32 { f32::from_bits(int_to_float::u128_to_f32_bits(i)) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floatuntidf(i: u128) -> f64 { f64::from_bits(int_to_float::u128_to_f64_bits(i)) } @@ -129,13 +129,13 @@ intrinsics! { f64::from_bits(int_to_float::u64_to_f64_bits(i.unsigned_abs()) | sign_bit) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floattisf(i: i128) -> f32 { let sign_bit = ((i >> 127) as u32) << 31; f32::from_bits(int_to_float::u128_to_f32_bits(i.unsigned_abs()) | sign_bit) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floattidf(i: i128) -> f64 { let sign_bit = ((i >> 127) as u64) << 63; f64::from_bits(int_to_float::u128_to_f64_bits(i.unsigned_abs()) | sign_bit) diff --git a/vendor/compiler_builtins/src/lib.rs b/vendor/compiler_builtins/src/lib.rs index 009923d27..e7bc61e4c 100644 --- a/vendor/compiler_builtins/src/lib.rs +++ b/vendor/compiler_builtins/src/lib.rs @@ -6,6 +6,7 @@ #![feature(compiler_builtins)] #![feature(core_ffi_c)] #![feature(core_intrinsics)] +#![feature(inline_const)] #![feature(lang_items)] #![feature(linkage)] #![feature(naked_functions)] @@ -45,6 +46,7 @@ pub mod int; all(target_family = "wasm", target_os = "unknown"), all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "arm", target_os = "none"), + target_os = "xous", all(target_vendor = "fortanix", target_env = "sgx") ))] pub mod math; diff --git a/vendor/compiler_builtins/src/macros.rs b/vendor/compiler_builtins/src/macros.rs index 518a18d4d..7d90b7aad 100644 --- a/vendor/compiler_builtins/src/macros.rs +++ b/vendor/compiler_builtins/src/macros.rs @@ -174,7 +174,7 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( - #[cfg(all(windows, target_pointer_width = "64"))] + #[cfg(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64"))] intrinsics! { $(#[$($attr)*])* pub extern "unadjusted" fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -182,7 +182,7 @@ macro_rules! intrinsics { } } - #[cfg(not(all(windows, target_pointer_width = "64")))] + #[cfg(not(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64")))] intrinsics! { $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -209,13 +209,13 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( - #[cfg(all(windows, target_arch = "x86_64"))] + #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } - #[cfg(all(windows, target_arch = "x86_64"))] + #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] pub mod $name { #[cfg_attr(not(feature = "mangled-names"), no_mangle)] pub extern $abi fn $name( $($argname: $ty),* ) @@ -226,7 +226,7 @@ macro_rules! 
intrinsics { } } - #[cfg(not(all(windows, target_arch = "x86_64")))] + #[cfg(not(all(any(windows, target_os = "uefi"), target_arch = "x86_64")))] intrinsics! { $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -426,7 +426,7 @@ macro_rules! intrinsics { // Hack for LLVM expectations for ABI on windows. This is used by the // `#[win64_128bit_abi_hack]` attribute recognized above -#[cfg(all(windows, target_pointer_width = "64"))] +#[cfg(all(any(windows, target_os = "uefi"), target_pointer_width = "64"))] pub mod win64_128bit_abi_hack { #[repr(simd)] pub struct U64x2(u64, u64); diff --git a/vendor/compiler_builtins/src/math.rs b/vendor/compiler_builtins/src/math.rs index fa59753f8..fa9836186 100644 --- a/vendor/compiler_builtins/src/math.rs +++ b/vendor/compiler_builtins/src/math.rs @@ -20,6 +20,7 @@ macro_rules! no_mangle { target_os = "unknown", not(target_env = "wasi") ), + target_os = "xous", all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "xtensa", target_os = "none"), all(target_vendor = "fortanix", target_env = "sgx") @@ -62,6 +63,8 @@ no_mangle! { fn tanhf(n: f32) -> f32; fn ldexp(f: f64, n: i32) -> f64; fn ldexpf(f: f32, n: i32) -> f32; + fn tgamma(x: f64) -> f64; + fn tgammaf(x: f32) -> f32; } #[cfg(any( @@ -70,6 +73,8 @@ no_mangle! { target_os = "unknown", not(target_env = "wasi") ), + target_os = "xous", + all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "xtensa", target_os = "none"), all(target_vendor = "fortanix", target_env = "sgx") ))] @@ -93,7 +98,17 @@ no_mangle! { fn tanf(n: f32) -> f32; } -#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] +#[cfg(any(target_os = "xous", target_os = "uefi"))] +no_mangle! { + fn sqrtf(x: f32) -> f32; + fn sqrt(x: f64) -> f64; +} + +#[cfg(any( + all(target_vendor = "fortanix", target_env = "sgx"), + target_os = "xous", + target_os = "uefi" +))] no_mangle! { fn ceil(x: f64) -> f64; fn ceilf(x: f32) -> f32; diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs index 815132425..72003a5c4 100644 --- a/vendor/compiler_builtins/src/mem/impls.rs +++ b/vendor/compiler_builtins/src/mem/impls.rs @@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { } set_bytes_bytes(s, c, n); } + +#[inline(always)] +pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 +} diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs index a55113861..c5b0ddc16 100644 --- a/vendor/compiler_builtins/src/mem/mod.rs +++ b/vendor/compiler_builtins/src/mem/mod.rs @@ -51,16 +51,7 @@ intrinsics! { #[mem_builtin] #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { - let mut i = 0; - while i < n { - let a = *s1.add(i); - let b = *s2.add(i); - if a != b { - return a as i32 - b as i32; - } - i += 1; - } - 0 + impls::compare_bytes(s1, s2, n) } #[mem_builtin] diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs index a7ab6f605..17b461f79 100644 --- a/vendor/compiler_builtins/src/mem/x86_64.rs +++ b/vendor/compiler_builtins/src/mem/x86_64.rs @@ -16,6 +16,10 @@ // feature is present at compile-time. We don't bother detecting other features. 
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::arch::asm; +use core::intrinsics; +use core::mem; + #[inline(always)] #[cfg(target_feature = "ermsb")] pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { @@ -31,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe movsq (%rsi), (%rdi)", - "mov {byte_count:e}, %ecx", - "repe movsb (%rsi), (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) { + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + asm!( + "rep movsb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep movsq", inout("rcx") qword_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep movsb", + inout("ecx") byte_count => _, inout("rdi") dest => _, inout("rsi") src => _, options(att_syntax, nostack, preserves_flags) @@ -49,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // We can't separate this block due to std/cld + asm!( "std", - "repe movsq (%rsi), (%rdi)", - "movl {byte_count:e}, %ecx", - "addq $7, %rdi", - "addq $7, %rsi", - "repe movsb (%rsi), (%rdi)", + "rep movsb", + "sub $7, %rsi", + "sub $7, %rdi", + "mov {qword_count}, %rcx", + "rep movsq", + "test {pre_byte_count:e}, {pre_byte_count:e}", + "add $7, %rsi", + "add $7, %rdi", + "mov {pre_byte_count:e}, %ecx", + "rep movsb", "cld", - byte_count = in(reg) byte_count, - inout("rcx") qword_count => _, - inout("rdi") dest.add(count).wrapping_sub(8) => _, - inout("rsi") src.add(count).wrapping_sub(8) => _, - options(att_syntax, nostack) + pre_byte_count = in(reg) pre_byte_count, + qword_count = in(reg) qword_count, + inout("ecx") byte_count => _, + inout("rdi") dest.add(count - 1) => _, + inout("rsi") src.add(count - 1) => _, + // We modify flags, but we restore it afterwards + options(att_syntax, nostack, preserves_flags) ); } @@ -83,18 +103,82 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
- core::arch::asm!( - "repe stosq %rax, (%rdi)", - "mov {byte_count:e}, %ecx", - "repe stosb %al, (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { + let c = c as u64 * 0x0101_0101_0101_0101; + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + asm!( + "rep stosb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep stosq", inout("rcx") qword_count => _, + inout("rdi") dest => dest, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep stosb", + inout("ecx") byte_count => _, inout("rdi") dest => _, - in("rax") (c as u64) * 0x0101010101010101, + in("rax") c, options(att_syntax, nostack, preserves_flags) ); } + +#[inline(always)] +pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { + #[inline(always)] + unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32 + where + T: Clone + Copy + Eq, + U: Clone + Copy + Eq, + F: FnOnce(*const U, *const U, usize) -> i32, + { + // Ensure T is not a ZST. + const { assert!(mem::size_of::<T>() != 0) }; + + let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>())); + while a != end { + if a.read_unaligned() != b.read_unaligned() { + return f(a.cast(), b.cast(), mem::size_of::<T>()); + } + a = a.add(1); + b = b.add(1); + } + f( + a.cast(), + b.cast(), + intrinsics::unchecked_rem(n, mem::size_of::<T>()), + ) + } + let c1 = |mut a: *const u8, mut b: *const u8, n| { + for _ in 0..n { + if a.read() != b.read() { + return i32::from(a.read()) - i32::from(b.read()); + } + a = a.add(1); + b = b.add(1); + } + 0 + }; + let c2 = |a: *const u16, b, n| cmp(a, b, n, c1); + let c4 = |a: *const u32, b, n| cmp(a, b, n, c2); + let c8 = |a: *const u64, b, n| cmp(a, b, n, c4); + let c16 = |a: *const u128, b, n| cmp(a, b, n, c8); + c16(a.cast(), b.cast(), n) +} + +/// Determine optimal parameters for a `rep` instruction. +fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { + // Unaligned writes are still slow on modern processors, so align the destination address. + let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count); + count -= pre_byte_count; + let qword_count = count >> 3; + let byte_count = count & 0b111; + (pre_byte_count, qword_count, byte_count) +} |
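The arm.rs hunks above attach #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] to the __aeabi_* division helpers so that, on ELF targets, a definition shipped by libc or another runtime can override the crate's copy at link time. A minimal sketch of that pattern with a hypothetical symbol name; it needs a nightly compiler because the linkage attribute is unstable:

// Nightly-only sketch: the `linkage` attribute is behind #![feature(linkage)].
#![feature(linkage)]

// Hypothetical intrinsic-style symbol. On ELF targets the weak linkage lets a
// strong definition from libc (or any other object) win at link time; Windows
// and Mach-O targets are excluded, mirroring the cfg_attr predicate in arm.rs.
#[cfg_attr(all(not(windows), not(target_vendor = "apple")), linkage = "weak")]
#[no_mangle]
pub extern "C" fn __example_weak_helper(x: u32) -> u32 {
    x.wrapping_mul(2)
}

fn main() {
    println!("{}", __example_weak_helper(21));
}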
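The arm_linux.rs change threads a second closure through atomic_rmw: f still computes the new value, and the added g(old, new) picks the return value, so one compare-and-swap loop can back both the __sync_fetch_and_* intrinsics (return the old value, the @old arm of the macro) and the newly added __sync_*_and_fetch ones (return the new value, the @new arm). A portable sketch of that shape, using AtomicU32::compare_exchange in place of the kernel's __kuser_cmpxchg helper:

use std::sync::atomic::{AtomicU32, Ordering};

// Generic CAS loop: `f` computes the new value, `g` selects which value to
// return (the old one for fetch_and_op, the new one for op_and_fetch).
fn atomic_rmw<F, G>(a: &AtomicU32, f: F, g: G) -> u32
where
    F: Fn(u32) -> u32,
    G: Fn(u32, u32) -> u32,
{
    let mut cur = a.load(Ordering::Relaxed);
    loop {
        let new = f(cur);
        match a.compare_exchange(cur, new, Ordering::SeqCst, Ordering::Relaxed) {
            Ok(_) => return g(cur, new),
            Err(actual) => cur = actual,
        }
    }
}

fn fetch_and_add(a: &AtomicU32, v: u32) -> u32 {
    atomic_rmw(a, |x| x.wrapping_add(v), |old, _| old)
}

fn add_and_fetch(a: &AtomicU32, v: u32) -> u32 {
    atomic_rmw(a, |x| x.wrapping_add(v), |_, new| new)
}

fn main() {
    let a = AtomicU32::new(5);
    assert_eq!(fetch_and_add(&a, 3), 5); // returns the old value
    assert_eq!(add_and_fetch(&a, 3), 11); // returns the new value
    assert_eq!(a.load(Ordering::Relaxed), 11);
}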
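Those intrinsics only exist in 1-, 2- and 4-byte widths, and the sub-word ones are emulated on the aligned 32-bit word that contains the value, via the align_ptr/get_shift_mask/insert_aligned helpers visible in the hunk context. A rough, single-threaded illustration of just the masking arithmetic (little-endian byte order assumed, as on ARM Linux):

// Extract the field that starts `shift` bits into the aligned word.
fn extract_aligned(aligned: u32, shift: u32, mask: u32) -> u32 {
    (aligned >> shift) & mask
}

// Insert `val` back into that field, leaving the other bytes of the word intact.
fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
    (aligned & !(mask << shift)) | ((val & mask) << shift)
}

fn main() {
    // A 4-byte aligned word holding the bytes [0x11, 0x22, 0x33, 0x44] (LE).
    let word: u32 = 0x4433_2211;
    // Byte at offset 2 within the word: shift = 16 bits, mask = 0xff.
    let (shift, mask) = (16, 0xff);
    assert_eq!(extract_aligned(word, shift, mask), 0x33);
    // Overwrite just that byte with 0xAB, as an emulated 8-bit store would.
    let updated = insert_aligned(word, 0xab, shift, mask);
    assert_eq!(updated, 0x44AB_2211);
    println!("{updated:#010x}");
}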
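In float/conv.rs the signed conversions __floattisf/__floattidf reuse the unsigned path: they convert i.unsigned_abs() and then OR the sign bit into the IEEE-754 bit pattern. The hunks above only widen their cfg gate to UEFI, but the trick itself is easy to check; the sketch below substitutes the built-in u128-to-f64 cast for the crate's int_to_float::u128_to_f64_bits:

// Signed 128-bit to f64 via the unsigned magnitude plus an explicit sign bit.
fn i128_to_f64(i: i128) -> f64 {
    let sign_bit = ((i >> 127) as u64) << 63;
    f64::from_bits((i.unsigned_abs() as f64).to_bits() | sign_bit)
}

fn main() {
    let samples = [0i128, 1, -1, 42, -42, i128::MAX, i128::MIN, -(1i128 << 100)];
    for &i in &samples {
        assert_eq!(i128_to_f64(i).to_bits(), (i as f64).to_bits());
    }
    println!("sign-bit trick matches the built-in conversion");
}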
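The macros.rs hunks widen every all(windows, ...) predicate to also cover x86_64 UEFI, which uses the Windows calling convention, so the win64 ABI workarounds apply there as well. The intrinsics! macro handles this by expanding each such function twice behind complementary cfg gates; a toy macro showing that dispatch pattern (the macro name and selector are illustrative, not the crate's real interface):

// Expands to a win64-style variant and a fallback variant of the same function,
// gated on complementary cfg predicates (the same ones the patch introduces).
macro_rules! abi_dependent {
    (fn $name:ident() -> &'static str { $win:expr, $other:expr }) => {
        #[cfg(all(
            any(windows, all(target_os = "uefi", target_arch = "x86_64")),
            target_pointer_width = "64"
        ))]
        fn $name() -> &'static str { $win }

        #[cfg(not(all(
            any(windows, all(target_os = "uefi", target_arch = "x86_64")),
            target_pointer_width = "64"
        )))]
        fn $name() -> &'static str { $other }
    };
}

abi_dependent!(fn which_abi() -> &'static str { "win64-style ABI", "default ABI" });

fn main() {
    println!("this build uses the {}", which_abi());
}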
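math.rs grows its symbol list (tgamma/tgammaf, plus sqrt/ceil and friends for the new xous and UEFI targets); each entry goes through a no_mangle! macro that emits an unmangled extern "C" wrapper around the Rust libm implementation. A cut-down sketch of that wrapper-generating pattern; the exported names carry a demo_ prefix here precisely so they cannot collide with the real C library, and std stands in for libm:

// Forward each listed signature to an implementation, exporting it under an
// unmangled C symbol name so C callers (or compiler-generated calls) can find it.
macro_rules! no_mangle {
    ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty => $imp:expr;)+) => {
        $(
            #[no_mangle]
            pub extern "C" fn $fun($($iid: $ity),+) -> $oty {
                $imp($($iid),+)
            }
        )+
    };
}

// Hypothetical "demo_" symbols backed by std; the real crate forwards to `libm`.
no_mangle! {
    fn demo_sqrtf(x: f32) -> f32 => f32::sqrt;
    fn demo_ldexp(f: f64, n: i32) -> f64 => demo_ldexp_impl;
}

fn demo_ldexp_impl(f: f64, n: i32) -> f64 {
    f * 2f64.powi(n)
}

fn main() {
    println!("{} {}", demo_sqrtf(9.0), demo_ldexp(1.5, 3));
}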
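mem/impls.rs gains compare_bytes, which is the byte-by-byte loop previously written inline in memcmp (mem/mod.rs now just forwards to it): it returns the difference between the first pair of bytes that disagree, or 0 if the first n bytes match. A safe, slice-based rendering of the same semantics with a couple of checks:

// memcmp-style comparison over the first `n` bytes of two slices.
fn compare_bytes(s1: &[u8], s2: &[u8], n: usize) -> i32 {
    for i in 0..n {
        let (a, b) = (s1[i], s2[i]);
        if a != b {
            return a as i32 - b as i32;
        }
    }
    0
}

fn main() {
    assert_eq!(compare_bytes(b"abcdef", b"abcdef", 6), 0);
    assert!(compare_bytes(b"abcdef", b"abddef", 6) < 0); // 'c' < 'd'
    assert_eq!(compare_bytes(b"abcXXX", b"abcYYY", 3), 0); // only the first 3 bytes compared
    println!("ok");
}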
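In mem/x86_64.rs, the new rep_param helper splits a count into a short unaligned prefix (enough bytes to 8-byte-align the destination), a middle section counted in quadwords, and a byte tail, because rep stosq/movsq run fastest on an aligned destination. The function is plain arithmetic, so it can be exercised directly; a standalone copy that takes the destination as a usize address for the sake of the demo:

/// Split `count` bytes at destination address `dest` into (prefix bytes needed
/// to reach 8-byte alignment, number of 8-byte quadwords, trailing bytes).
fn rep_param(dest: usize, mut count: usize) -> (usize, usize, usize) {
    let pre_byte_count = ((8 - (dest & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    (pre_byte_count, count >> 3, count & 0b111)
}

fn main() {
    // Already aligned destination: no prefix, 12 quadwords, 4-byte tail.
    assert_eq!(rep_param(0x1000, 100), (0, 12, 4));
    // Misaligned by 3: 5 prefix bytes restore alignment, then 11 quadwords + 7.
    assert_eq!(rep_param(0x1003, 100), (5, 11, 7));
    // Tiny copy: the alignment prefix is capped at `count`.
    assert_eq!(rep_param(0x1001, 3), (3, 0, 0));
    println!("ok");
}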
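With the destination aligned that way, copy_forward becomes three separate asm! blocks (prefix bytes, aligned quadwords, tail bytes), which the in-diff comment notes gives the compiler more freedom to reorder instructions; copy_backward stays in a single block because the std/cld pair that flips the direction flag must bracket the whole copy. A rough x86_64-only sketch of the forward path using stable inline asm with the same register roles (an illustration, not the crate's exact code):

// Forward copy in three pieces: align the destination, bulk-copy quadwords,
// then finish the tail. Mirrors the structure of the patched copy_forward.
#[cfg(target_arch = "x86_64")]
unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    use std::arch::asm;
    let pre = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    let rest = count - pre;
    let (qwords, tail) = (rest >> 3, rest & 0b111);
    // Unaligned prefix, one byte at a time.
    asm!("rep movsb",
        inout("rcx") pre => _, inout("rdi") dest => dest, inout("rsi") src => src,
        options(nostack, preserves_flags));
    // Aligned middle section, eight bytes at a time.
    asm!("rep movsq",
        inout("rcx") qwords => _, inout("rdi") dest => dest, inout("rsi") src => src,
        options(nostack, preserves_flags));
    // Remaining tail bytes.
    asm!("rep movsb",
        inout("rcx") tail => _, inout("rdi") dest => _, inout("rsi") src => _,
        options(nostack, preserves_flags));
}

#[cfg(target_arch = "x86_64")]
fn main() {
    let src: Vec<u8> = (0..=255u8).cycle().take(1000).collect();
    let mut dst = vec![0u8; 1000];
    unsafe { copy_forward(dst.as_mut_ptr(), src.as_ptr(), dst.len()) };
    assert_eq!(src, dst);
    println!("copied {} bytes", dst.len());
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}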
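set_bytes now computes its fill pattern once: multiplying the byte by 0x0101_0101_0101_0101 replicates it into every byte of a u64, so the same rax value feeds rep stosq and both rep stosb loops. A two-line demonstration of the splat:

fn splat(c: u8) -> u64 {
    c as u64 * 0x0101_0101_0101_0101
}

fn main() {
    assert_eq!(splat(0xab), 0xabab_abab_abab_abab);
    assert_eq!(splat(0xab).to_ne_bytes(), [0xab; 8]); // every byte of the qword is the fill byte
    println!("{:#018x}", splat(0x5a));
}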
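Finally, the new x86_64 compare_bytes compares 16 bytes at a time and, on the first mismatching chunk, re-runs the comparison at half the width (u128 → u64 → u32 → u16 → u8), so the slow byte loop only ever covers the chunk that actually differs; its generic helper also uses the inline-const assertion enabled by the new #![feature(inline_const)] in lib.rs to reject zero-sized element types at compile time. A safe, slice-based sketch of the same narrowing idea, reduced to two levels (W-byte chunks, then bytes) and using slice equality where the original reads unaligned integers:

// Compare the first `n` bytes in W-byte chunks; only the first mismatching
// chunk (and the tail) is ever examined byte-by-byte.
fn compare_bytes<const W: usize>(a: &[u8], b: &[u8], n: usize) -> i32 {
    // Compile-time check in the spirit of the patch's `const { assert!(..) }`:
    // a zero-width chunk would make the loops below meaningless.
    const { assert!(W != 0) };

    let byte_cmp = |x: &[u8], y: &[u8]| -> i32 {
        for (&p, &q) in x.iter().zip(y) {
            if p != q {
                return i32::from(p) - i32::from(q);
            }
        }
        0
    };

    let chunks = n / W;
    for i in 0..chunks {
        let (ca, cb) = (&a[i * W..(i + 1) * W], &b[i * W..(i + 1) * W]);
        if ca != cb {
            return byte_cmp(ca, cb); // narrow down inside the mismatching chunk
        }
    }
    byte_cmp(&a[chunks * W..n], &b[chunks * W..n]) // tail shorter than one chunk
}

fn main() {
    let x = vec![7u8; 100];
    let mut y = x.clone();
    assert_eq!(compare_bytes::<8>(&x, &y, 100), 0);
    y[83] = 9;
    assert!(compare_bytes::<8>(&x, &y, 100) < 0);
    assert_eq!(compare_bytes::<8>(&x, &y, 83), 0); // mismatch lies past the compared prefix
    println!("ok");
}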