Diffstat (limited to 'vendor/compiler_builtins/src')
-rw-r--r--  vendor/compiler_builtins/src/arm.rs        |   4
-rw-r--r--  vendor/compiler_builtins/src/arm_linux.rs  | 110
-rw-r--r--  vendor/compiler_builtins/src/float/conv.rs |   8
-rw-r--r--  vendor/compiler_builtins/src/lib.rs        |   2
-rw-r--r--  vendor/compiler_builtins/src/macros.rs     |  12
-rw-r--r--  vendor/compiler_builtins/src/math.rs       |  17
-rw-r--r--  vendor/compiler_builtins/src/mem/impls.rs  |  14
-rw-r--r--  vendor/compiler_builtins/src/mem/mod.rs    |  11
-rw-r--r--  vendor/compiler_builtins/src/mem/x86_64.rs | 150
9 files changed, 237 insertions(+), 91 deletions(-)
diff --git a/vendor/compiler_builtins/src/arm.rs b/vendor/compiler_builtins/src/arm.rs
index 9c1b6ad12..e517a9ef3 100644
--- a/vendor/compiler_builtins/src/arm.rs
+++ b/vendor/compiler_builtins/src/arm.rs
@@ -22,6 +22,7 @@ intrinsics! {
// custom calling convention which can't be implemented using a normal Rust function.
#[naked]
#[cfg(not(target_env = "msvc"))]
+ #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")]
pub unsafe extern "C" fn __aeabi_uidivmod() {
core::arch::asm!(
"push {{lr}}",
@@ -36,6 +37,7 @@ intrinsics! {
}
#[naked]
+ #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")]
pub unsafe extern "C" fn __aeabi_uldivmod() {
core::arch::asm!(
"push {{r4, lr}}",
@@ -52,6 +54,7 @@ intrinsics! {
}
#[naked]
+ #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")]
pub unsafe extern "C" fn __aeabi_idivmod() {
core::arch::asm!(
"push {{r0, r1, r4, lr}}",
@@ -65,6 +68,7 @@ intrinsics! {
}
#[naked]
+ #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")]
pub unsafe extern "C" fn __aeabi_ldivmod() {
core::arch::asm!(
"push {{r4, lr}}",
diff --git a/vendor/compiler_builtins/src/arm_linux.rs b/vendor/compiler_builtins/src/arm_linux.rs
index 8fe09485b..8f22eb628 100644
--- a/vendor/compiler_builtins/src/arm_linux.rs
+++ b/vendor/compiler_builtins/src/arm_linux.rs
@@ -55,7 +55,7 @@ fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
}
// Generic atomic read-modify-write operation
-unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 {
+unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T, f: F, g: G) -> u32 {
let aligned_ptr = align_ptr(ptr);
let (shift, mask) = get_shift_mask(ptr);
@@ -65,7 +65,7 @@ unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 {
let newval = f(curval);
let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
- return curval;
+ return g(curval, newval);
}
}
}
@@ -89,13 +89,21 @@ unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 {
}
macro_rules! atomic_rmw {
- ($name:ident, $ty:ty, $op:expr) => {
+ ($name:ident, $ty:ty, $op:expr, $fetch:expr) => {
intrinsics! {
pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty {
- atomic_rmw(ptr, |x| $op(x as $ty, val) as u32) as $ty
+ atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty
}
}
};
+
+ (@old $name:ident, $ty:ty, $op:expr) => {
+ atomic_rmw!($name, $ty, $op, |old, _| old);
+ };
+
+ (@new $name:ident, $ty:ty, $op:expr) => {
+ atomic_rmw!($name, $ty, $op, |_, new| new);
+ };
}
macro_rules! atomic_cmpxchg {
($name:ident, $ty:ty) => {
@@ -107,101 +115,129 @@ macro_rules! atomic_cmpxchg {
};
}
-atomic_rmw!(__sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b));
-atomic_rmw!(__sync_fetch_and_add_2, u16, |a: u16, b: u16| a
+atomic_rmw!(@old __sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b));
+atomic_rmw!(@old __sync_fetch_and_add_2, u16, |a: u16, b: u16| a
+ .wrapping_add(b));
+atomic_rmw!(@old __sync_fetch_and_add_4, u32, |a: u32, b: u32| a
+ .wrapping_add(b));
+
+atomic_rmw!(@new __sync_add_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_add(b));
+atomic_rmw!(@new __sync_add_and_fetch_2, u16, |a: u16, b: u16| a
.wrapping_add(b));
-atomic_rmw!(__sync_fetch_and_add_4, u32, |a: u32, b: u32| a
+atomic_rmw!(@new __sync_add_and_fetch_4, u32, |a: u32, b: u32| a
.wrapping_add(b));
-atomic_rmw!(__sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
-atomic_rmw!(__sync_fetch_and_sub_2, u16, |a: u16, b: u16| a
+atomic_rmw!(@old __sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
+atomic_rmw!(@old __sync_fetch_and_sub_2, u16, |a: u16, b: u16| a
.wrapping_sub(b));
-atomic_rmw!(__sync_fetch_and_sub_4, u32, |a: u32, b: u32| a
+atomic_rmw!(@old __sync_fetch_and_sub_4, u32, |a: u32, b: u32| a
.wrapping_sub(b));
-atomic_rmw!(__sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b);
-atomic_rmw!(__sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b);
-atomic_rmw!(__sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b);
+atomic_rmw!(@new __sync_sub_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
+atomic_rmw!(@new __sync_sub_and_fetch_2, u16, |a: u16, b: u16| a
+ .wrapping_sub(b));
+atomic_rmw!(@new __sync_sub_and_fetch_4, u32, |a: u32, b: u32| a
+ .wrapping_sub(b));
+
+atomic_rmw!(@old __sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b);
+atomic_rmw!(@old __sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b);
+atomic_rmw!(@old __sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b);
+
+atomic_rmw!(@new __sync_and_and_fetch_1, u8, |a: u8, b: u8| a & b);
+atomic_rmw!(@new __sync_and_and_fetch_2, u16, |a: u16, b: u16| a & b);
+atomic_rmw!(@new __sync_and_and_fetch_4, u32, |a: u32, b: u32| a & b);
+
+atomic_rmw!(@old __sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b);
+atomic_rmw!(@old __sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b);
+atomic_rmw!(@old __sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b);
+
+atomic_rmw!(@new __sync_or_and_fetch_1, u8, |a: u8, b: u8| a | b);
+atomic_rmw!(@new __sync_or_and_fetch_2, u16, |a: u16, b: u16| a | b);
+atomic_rmw!(@new __sync_or_and_fetch_4, u32, |a: u32, b: u32| a | b);
+
+atomic_rmw!(@old __sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b);
+atomic_rmw!(@old __sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b);
+atomic_rmw!(@old __sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b);
-atomic_rmw!(__sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b);
-atomic_rmw!(__sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b);
-atomic_rmw!(__sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b);
+atomic_rmw!(@new __sync_xor_and_fetch_1, u8, |a: u8, b: u8| a ^ b);
+atomic_rmw!(@new __sync_xor_and_fetch_2, u16, |a: u16, b: u16| a ^ b);
+atomic_rmw!(@new __sync_xor_and_fetch_4, u32, |a: u32, b: u32| a ^ b);
-atomic_rmw!(__sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b);
-atomic_rmw!(__sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b);
-atomic_rmw!(__sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b);
+atomic_rmw!(@old __sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b));
+atomic_rmw!(@old __sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b));
+atomic_rmw!(@old __sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b));
-atomic_rmw!(__sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b));
-atomic_rmw!(__sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b));
-atomic_rmw!(__sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b));
+atomic_rmw!(@new __sync_nand_and_fetch_1, u8, |a: u8, b: u8| !(a & b));
+atomic_rmw!(@new __sync_nand_and_fetch_2, u16, |a: u16, b: u16| !(a & b));
+atomic_rmw!(@new __sync_nand_and_fetch_4, u32, |a: u32, b: u32| !(a & b));
-atomic_rmw!(__sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b {
+atomic_rmw!(@old __sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b {
+atomic_rmw!(@old __sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b {
+atomic_rmw!(@old __sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b {
+atomic_rmw!(@old __sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b {
+atomic_rmw!(@old __sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b {
+atomic_rmw!(@old __sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b {
+atomic_rmw!(@old __sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b {
+atomic_rmw!(@old __sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b {
+atomic_rmw!(@old __sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b {
+atomic_rmw!(@old __sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b {
+atomic_rmw!(@old __sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b {
+atomic_rmw!(@old __sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b {
a
} else {
b
});
-atomic_rmw!(__sync_lock_test_and_set_1, u8, |_: u8, b: u8| b);
-atomic_rmw!(__sync_lock_test_and_set_2, u16, |_: u16, b: u16| b);
-atomic_rmw!(__sync_lock_test_and_set_4, u32, |_: u32, b: u32| b);
+atomic_rmw!(@old __sync_lock_test_and_set_1, u8, |_: u8, b: u8| b);
+atomic_rmw!(@old __sync_lock_test_and_set_2, u16, |_: u16, b: u16| b);
+atomic_rmw!(@old __sync_lock_test_and_set_4, u32, |_: u32, b: u32| b);
atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8);
atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16);
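The reworked `atomic_rmw` threads a second closure `g(old, new)` through the kuser-cmpxchg retry loop, so one implementation serves both GCC builtin families: `@old` wires in `|old, _| old` for `__sync_fetch_and_<op>`, and `@new` wires in `|_, new| new` for the newly added `__sync_<op>_and_fetch` symbols. A standalone sketch of the same pattern, substituting `AtomicU32::compare_exchange` for the Linux kuser helper:

```rust
use core::sync::atomic::{AtomicU32, Ordering};

// f computes the new value; g picks which of (old, new) the caller sees.
fn rmw(cell: &AtomicU32, f: impl Fn(u32) -> u32, g: impl Fn(u32, u32) -> u32) -> u32 {
    loop {
        let old = cell.load(Ordering::Relaxed);
        let new = f(old);
        if cell
            .compare_exchange(old, new, Ordering::SeqCst, Ordering::Relaxed)
            .is_ok()
        {
            return g(old, new);
        }
    }
}

fn main() {
    let x = AtomicU32::new(5);
    // __sync_fetch_and_add flavor (@old): value before the update.
    assert_eq!(rmw(&x, |v| v.wrapping_add(3), |old, _| old), 5);
    // __sync_add_and_fetch flavor (@new): value after the update.
    assert_eq!(rmw(&x, |v| v.wrapping_add(3), |_, new| new), 11);
}
```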
diff --git a/vendor/compiler_builtins/src/float/conv.rs b/vendor/compiler_builtins/src/float/conv.rs
index 07b58f3d2..68ba63408 100644
--- a/vendor/compiler_builtins/src/float/conv.rs
+++ b/vendor/compiler_builtins/src/float/conv.rs
@@ -92,12 +92,12 @@ intrinsics! {
f64::from_bits(int_to_float::u64_to_f64_bits(i))
}
- #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)]
+ #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)]
pub extern "C" fn __floatuntisf(i: u128) -> f32 {
f32::from_bits(int_to_float::u128_to_f32_bits(i))
}
- #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)]
+ #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)]
pub extern "C" fn __floatuntidf(i: u128) -> f64 {
f64::from_bits(int_to_float::u128_to_f64_bits(i))
}
@@ -129,13 +129,13 @@ intrinsics! {
f64::from_bits(int_to_float::u64_to_f64_bits(i.unsigned_abs()) | sign_bit)
}
- #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)]
+ #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)]
pub extern "C" fn __floattisf(i: i128) -> f32 {
let sign_bit = ((i >> 127) as u32) << 31;
f32::from_bits(int_to_float::u128_to_f32_bits(i.unsigned_abs()) | sign_bit)
}
- #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)]
+ #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)]
pub extern "C" fn __floattidf(i: i128) -> f64 {
let sign_bit = ((i >> 127) as u64) << 63;
f64::from_bits(int_to_float::u128_to_f64_bits(i.unsigned_abs()) | sign_bit)
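The `cfg_attr` changes in this file only widen the Win64 ABI adjustment to x86_64 UEFI; the sign handling visible in the surrounding context is unchanged but worth spelling out. An arithmetic right shift by 127 smears the `i128` sign across the word, so the narrowing cast plus a left shift by 31 (or 63) yields exactly the IEEE 754 sign bit:

```rust
fn sign_bit_f32(i: i128) -> u32 {
    // i >> 127 is 0 for non-negative i and -1 (all ones) for negative i;
    // `as u32` keeps the low 32 bits and << 31 leaves only the sign position.
    ((i >> 127) as u32) << 31
}

fn main() {
    assert_eq!(sign_bit_f32(5), 0);
    assert_eq!(sign_bit_f32(-5), 0x8000_0000);
    assert_eq!(f32::from_bits(1.5f32.to_bits() | sign_bit_f32(-5)), -1.5);
}
```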
diff --git a/vendor/compiler_builtins/src/lib.rs b/vendor/compiler_builtins/src/lib.rs
index 009923d27..e7bc61e4c 100644
--- a/vendor/compiler_builtins/src/lib.rs
+++ b/vendor/compiler_builtins/src/lib.rs
@@ -6,6 +6,7 @@
#![feature(compiler_builtins)]
#![feature(core_ffi_c)]
#![feature(core_intrinsics)]
+#![feature(inline_const)]
#![feature(lang_items)]
#![feature(linkage)]
#![feature(naked_functions)]
@@ -45,6 +46,7 @@ pub mod int;
all(target_family = "wasm", target_os = "unknown"),
all(target_arch = "x86_64", target_os = "uefi"),
all(target_arch = "arm", target_os = "none"),
+ target_os = "xous",
all(target_vendor = "fortanix", target_env = "sgx")
))]
pub mod math;
diff --git a/vendor/compiler_builtins/src/macros.rs b/vendor/compiler_builtins/src/macros.rs
index 518a18d4d..7d90b7aad 100644
--- a/vendor/compiler_builtins/src/macros.rs
+++ b/vendor/compiler_builtins/src/macros.rs
@@ -174,7 +174,7 @@ macro_rules! intrinsics {
$($rest:tt)*
) => (
- #[cfg(all(windows, target_pointer_width = "64"))]
+ #[cfg(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64"))]
intrinsics! {
$(#[$($attr)*])*
pub extern "unadjusted" fn $name( $($argname: $ty),* ) $(-> $ret)? {
@@ -182,7 +182,7 @@ macro_rules! intrinsics {
}
}
- #[cfg(not(all(windows, target_pointer_width = "64")))]
+ #[cfg(not(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64")))]
intrinsics! {
$(#[$($attr)*])*
pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
@@ -209,13 +209,13 @@ macro_rules! intrinsics {
$($rest:tt)*
) => (
- #[cfg(all(windows, target_arch = "x86_64"))]
+ #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))]
$(#[$($attr)*])*
pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
$($body)*
}
- #[cfg(all(windows, target_arch = "x86_64"))]
+ #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))]
pub mod $name {
#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
pub extern $abi fn $name( $($argname: $ty),* )
@@ -226,7 +226,7 @@ macro_rules! intrinsics {
}
}
- #[cfg(not(all(windows, target_arch = "x86_64")))]
+ #[cfg(not(all(any(windows, target_os = "uefi"), target_arch = "x86_64")))]
intrinsics! {
$(#[$($attr)*])*
pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
@@ -426,7 +426,7 @@ macro_rules! intrinsics {
// Hack for LLVM expectations for ABI on windows. This is used by the
// `#[win64_128bit_abi_hack]` attribute recognized above
-#[cfg(all(windows, target_pointer_width = "64"))]
+#[cfg(all(any(windows, target_os = "uefi"), target_pointer_width = "64"))]
pub mod win64_128bit_abi_hack {
#[repr(simd)]
pub struct U64x2(u64, u64);
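These hunks extend each `windows`-only branch of the ABI workaround to x86_64 UEFI, which follows the same Win64 calling convention. The `U64x2` shim exists because LLVM expects 128-bit values in a two-lane vector on that ABI rather than as a plain `u128`; a hedged sketch of the lane split involved (`split_u128` is illustrative, not a name from the source):

```rust
// Low lane first, matching the (u64, u64) field order of U64x2 above.
fn split_u128(x: u128) -> (u64, u64) {
    (x as u64, (x >> 64) as u64)
}

fn main() {
    let (lo, hi) = split_u128(0x0123_4567_89AB_CDEF_0000_0000_0000_0001);
    assert_eq!((lo, hi), (1, 0x0123_4567_89AB_CDEF));
}
```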
diff --git a/vendor/compiler_builtins/src/math.rs b/vendor/compiler_builtins/src/math.rs
index fa59753f8..fa9836186 100644
--- a/vendor/compiler_builtins/src/math.rs
+++ b/vendor/compiler_builtins/src/math.rs
@@ -20,6 +20,7 @@ macro_rules! no_mangle {
target_os = "unknown",
not(target_env = "wasi")
),
+ target_os = "xous",
all(target_arch = "x86_64", target_os = "uefi"),
all(target_arch = "xtensa", target_os = "none"),
all(target_vendor = "fortanix", target_env = "sgx")
@@ -62,6 +63,8 @@ no_mangle! {
fn tanhf(n: f32) -> f32;
fn ldexp(f: f64, n: i32) -> f64;
fn ldexpf(f: f32, n: i32) -> f32;
+ fn tgamma(x: f64) -> f64;
+ fn tgammaf(x: f32) -> f32;
}
#[cfg(any(
@@ -70,6 +73,8 @@ no_mangle! {
target_os = "unknown",
not(target_env = "wasi")
),
+ target_os = "xous",
+ all(target_arch = "x86_64", target_os = "uefi"),
all(target_arch = "xtensa", target_os = "none"),
all(target_vendor = "fortanix", target_env = "sgx")
))]
@@ -93,7 +98,17 @@ no_mangle! {
fn tanf(n: f32) -> f32;
}
-#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))]
+#[cfg(any(target_os = "xous", target_os = "uefi"))]
+no_mangle! {
+ fn sqrtf(x: f32) -> f32;
+ fn sqrt(x: f64) -> f64;
+}
+
+#[cfg(any(
+ all(target_vendor = "fortanix", target_env = "sgx"),
+ target_os = "xous",
+ target_os = "uefi"
+))]
no_mangle! {
fn ceil(x: f64) -> f64;
fn ceilf(x: f32) -> f32;
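Each signature inside `no_mangle!` becomes an unmangled C-ABI wrapper that forwards to the `libm` crate, which is how the new `tgamma`/`tgammaf` entries and the `sqrt`/`ceil` groups for `xous` and `uefi` get their symbols. A simplified sketch of the expansion for one added entry (the real macro routes through `intrinsics!`; assumes `libm` as a dependency, as compiler_builtins has):

```rust
#[no_mangle]
pub extern "C" fn tgamma(x: f64) -> f64 {
    libm::tgamma(x)
}
```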
diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs
index 815132425..72003a5c4 100644
--- a/vendor/compiler_builtins/src/mem/impls.rs
+++ b/vendor/compiler_builtins/src/mem/impls.rs
@@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
}
set_bytes_bytes(s, c, n);
}
+
+#[inline(always)]
+pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+ let mut i = 0;
+ while i < n {
+ let a = *s1.add(i);
+ let b = *s2.add(i);
+ if a != b {
+ return a as i32 - b as i32;
+ }
+ i += 1;
+ }
+ 0
+}
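The new `compare_bytes` is a plain byte loop with C `memcmp` semantics: the return value is the difference of the first mismatching pair of bytes, so only its sign and zeroness are meaningful. For example (assuming the function above is in scope):

```rust
fn main() {
    let a = [1u8, 2, 3];
    let b = [1u8, 2, 4];
    // First mismatch at index 2: 3 - 4 == -1, so `a` orders before `b`.
    assert_eq!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), 3) }, -1);
    // Identical prefixes of the compared length give 0.
    assert_eq!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), 2) }, 0);
}
```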
diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs
index a55113861..c5b0ddc16 100644
--- a/vendor/compiler_builtins/src/mem/mod.rs
+++ b/vendor/compiler_builtins/src/mem/mod.rs
@@ -51,16 +51,7 @@ intrinsics! {
#[mem_builtin]
#[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
- let mut i = 0;
- while i < n {
- let a = *s1.add(i);
- let b = *s2.add(i);
- if a != b {
- return a as i32 - b as i32;
- }
- i += 1;
- }
- 0
+ impls::compare_bytes(s1, s2, n)
}
#[mem_builtin]
diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs
index a7ab6f605..17b461f79 100644
--- a/vendor/compiler_builtins/src/mem/x86_64.rs
+++ b/vendor/compiler_builtins/src/mem/x86_64.rs
@@ -16,6 +16,10 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
+use core::arch::asm;
+use core::intrinsics;
+use core::mem;
+
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
@@ -31,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
- "repe movsq (%rsi), (%rdi)",
- "mov {byte_count:e}, %ecx",
- "repe movsb (%rsi), (%rdi)",
- byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
+ asm!(
+ "rep movsb",
+ inout("ecx") pre_byte_count => _,
+ inout("rdi") dest => dest,
+ inout("rsi") src => src,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep movsq",
inout("rcx") qword_count => _,
+ inout("rdi") dest => dest,
+ inout("rsi") src => src,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep movsb",
+ inout("ecx") byte_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
@@ -49,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // We can't separate this block due to std/cld
+ asm!(
"std",
- "repe movsq (%rsi), (%rdi)",
- "movl {byte_count:e}, %ecx",
- "addq $7, %rdi",
- "addq $7, %rsi",
- "repe movsb (%rsi), (%rdi)",
+ "rep movsb",
+ "sub $7, %rsi",
+ "sub $7, %rdi",
+ "mov {qword_count}, %rcx",
+ "rep movsq",
+ "test {pre_byte_count:e}, {pre_byte_count:e}",
+ "add $7, %rsi",
+ "add $7, %rdi",
+ "mov {pre_byte_count:e}, %ecx",
+ "rep movsb",
"cld",
- byte_count = in(reg) byte_count,
- inout("rcx") qword_count => _,
- inout("rdi") dest.add(count).wrapping_sub(8) => _,
- inout("rsi") src.add(count).wrapping_sub(8) => _,
- options(att_syntax, nostack)
+ pre_byte_count = in(reg) pre_byte_count,
+ qword_count = in(reg) qword_count,
+ inout("ecx") byte_count => _,
+ inout("rdi") dest.add(count - 1) => _,
+ inout("rsi") src.add(count - 1) => _,
+ // We modify flags, but we restore it afterwards
+ options(att_syntax, nostack, preserves_flags)
);
}
@@ -83,18 +103,82 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
- "repe stosq %rax, (%rdi)",
- "mov {byte_count:e}, %ecx",
- "repe stosb %al, (%rdi)",
- byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+ let c = c as u64 * 0x0101_0101_0101_0101;
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
+ asm!(
+ "rep stosb",
+ inout("ecx") pre_byte_count => _,
+ inout("rdi") dest => dest,
+ in("rax") c,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep stosq",
inout("rcx") qword_count => _,
+ inout("rdi") dest => dest,
+ in("rax") c,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep stosb",
+ inout("ecx") byte_count => _,
inout("rdi") dest => _,
- in("rax") (c as u64) * 0x0101010101010101,
+ in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
}
+
+#[inline(always)]
+pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
+ #[inline(always)]
+ unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
+ where
+ T: Clone + Copy + Eq,
+ U: Clone + Copy + Eq,
+ F: FnOnce(*const U, *const U, usize) -> i32,
+ {
+ // Ensure T is not a ZST.
+ const { assert!(mem::size_of::<T>() != 0) };
+
+ let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
+ while a != end {
+ if a.read_unaligned() != b.read_unaligned() {
+ return f(a.cast(), b.cast(), mem::size_of::<T>());
+ }
+ a = a.add(1);
+ b = b.add(1);
+ }
+ f(
+ a.cast(),
+ b.cast(),
+ intrinsics::unchecked_rem(n, mem::size_of::<T>()),
+ )
+ }
+ let c1 = |mut a: *const u8, mut b: *const u8, n| {
+ for _ in 0..n {
+ if a.read() != b.read() {
+ return i32::from(a.read()) - i32::from(b.read());
+ }
+ a = a.add(1);
+ b = b.add(1);
+ }
+ 0
+ };
+ let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
+ let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
+ let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
+ let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
+ c16(a.cast(), b.cast(), n)
+}
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+ // Unaligned writes are still slow on modern processors, so align the destination address.
+ let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+ count -= pre_byte_count;
+ let qword_count = count >> 3;
+ let byte_count = count & 0b111;
+ (pre_byte_count, qword_count, byte_count)
+}
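All three rewritten `rep` routines lean on `rep_param`, which splits a count into an alignment prefix, a qword body, and a byte tail, while `compare_bytes` narrows from 16-byte chunks down to single bytes only around the first mismatching chunk. A hedged sanity check, assuming the two helpers above are in scope (nightly, as in this crate) and using a made-up address:

```rust
fn main() {
    // A destination at 0x1003 needs 5 bytes to reach 8-byte alignment; the
    // remaining 95 bytes split into 11 qwords (88 bytes) and a 7-byte tail:
    // 5 + 88 + 7 == 100.
    let dest = 0x1003usize as *mut u8;
    assert_eq!(rep_param(dest, 100), (5, 11, 7));

    // One equal 16-byte chunk, then the mismatch in the 5-byte remainder is
    // resolved through the c8 -> c4 -> c2 -> c1 cascade.
    let a = *b"0123456789abcdef_tail";
    let mut b = a;
    b[18] = b'X';
    let r = unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) };
    assert_eq!(r, i32::from(b'a') - i32::from(b'X')); // 97 - 88 == 9
}
```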