Diffstat (limited to 'vendor/compiler_builtins/src')
-rw-r--r-- | vendor/compiler_builtins/src/arm.rs        |   4
-rw-r--r-- | vendor/compiler_builtins/src/arm_linux.rs  | 110
-rw-r--r-- | vendor/compiler_builtins/src/float/conv.rs |   8
-rw-r--r-- | vendor/compiler_builtins/src/lib.rs        |   2
-rw-r--r-- | vendor/compiler_builtins/src/macros.rs     |  12
-rw-r--r-- | vendor/compiler_builtins/src/math.rs       |  17
-rw-r--r-- | vendor/compiler_builtins/src/mem/impls.rs  |  14
-rw-r--r-- | vendor/compiler_builtins/src/mem/mod.rs    |  11
-rw-r--r-- | vendor/compiler_builtins/src/mem/x86_64.rs | 150
9 files changed, 237 insertions, 91 deletions
diff --git a/vendor/compiler_builtins/src/arm.rs b/vendor/compiler_builtins/src/arm.rs index 9c1b6ad12..e517a9ef3 100644 --- a/vendor/compiler_builtins/src/arm.rs +++ b/vendor/compiler_builtins/src/arm.rs @@ -22,6 +22,7 @@ intrinsics! { // custom calling convention which can't be implemented using a normal Rust function. #[naked] #[cfg(not(target_env = "msvc"))] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_uidivmod() { core::arch::asm!( "push {{lr}}", @@ -36,6 +37,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_uldivmod() { core::arch::asm!( "push {{r4, lr}}", @@ -52,6 +54,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_idivmod() { core::arch::asm!( "push {{r0, r1, r4, lr}}", @@ -65,6 +68,7 @@ intrinsics! { } #[naked] + #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] pub unsafe extern "C" fn __aeabi_ldivmod() { core::arch::asm!( "push {{r4, lr}}", diff --git a/vendor/compiler_builtins/src/arm_linux.rs b/vendor/compiler_builtins/src/arm_linux.rs index 8fe09485b..8f22eb628 100644 --- a/vendor/compiler_builtins/src/arm_linux.rs +++ b/vendor/compiler_builtins/src/arm_linux.rs @@ -55,7 +55,7 @@ fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 { } // Generic atomic read-modify-write operation -unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 { +unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T, f: F, g: G) -> u32 { let aligned_ptr = align_ptr(ptr); let (shift, mask) = get_shift_mask(ptr); @@ -65,7 +65,7 @@ unsafe fn atomic_rmw<T, F: Fn(u32) -> u32>(ptr: *mut T, f: F) -> u32 { let newval = f(curval); let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask); if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) { - return curval; + return g(curval, newval); } } } @@ -89,13 +89,21 @@ unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 { } macro_rules! atomic_rmw { - ($name:ident, $ty:ty, $op:expr) => { + ($name:ident, $ty:ty, $op:expr, $fetch:expr) => { intrinsics! { pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty { - atomic_rmw(ptr, |x| $op(x as $ty, val) as u32) as $ty + atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty } } }; + + (@old $name:ident, $ty:ty, $op:expr) => { + atomic_rmw!($name, $ty, $op, |old, _| old); + }; + + (@new $name:ident, $ty:ty, $op:expr) => { + atomic_rmw!($name, $ty, $op, |_, new| new); + }; } macro_rules! atomic_cmpxchg { ($name:ident, $ty:ty) => { @@ -107,101 +115,129 @@ macro_rules! 
atomic_cmpxchg { }; } -atomic_rmw!(__sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_add_2, u16, |a: u16, b: u16| a +atomic_rmw!(@old __sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b)); +atomic_rmw!(@old __sync_fetch_and_add_2, u16, |a: u16, b: u16| a + .wrapping_add(b)); +atomic_rmw!(@old __sync_fetch_and_add_4, u32, |a: u32, b: u32| a + .wrapping_add(b)); + +atomic_rmw!(@new __sync_add_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_add(b)); +atomic_rmw!(@new __sync_add_and_fetch_2, u16, |a: u16, b: u16| a .wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_add_4, u32, |a: u32, b: u32| a +atomic_rmw!(@new __sync_add_and_fetch_4, u32, |a: u32, b: u32| a .wrapping_add(b)); -atomic_rmw!(__sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_sub_2, u16, |a: u16, b: u16| a +atomic_rmw!(@old __sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); +atomic_rmw!(@old __sync_fetch_and_sub_2, u16, |a: u16, b: u16| a .wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_sub_4, u32, |a: u32, b: u32| a +atomic_rmw!(@old __sync_fetch_and_sub_4, u32, |a: u32, b: u32| a .wrapping_sub(b)); -atomic_rmw!(__sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b); -atomic_rmw!(__sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b); -atomic_rmw!(__sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b); +atomic_rmw!(@new __sync_sub_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_sub(b)); +atomic_rmw!(@new __sync_sub_and_fetch_2, u16, |a: u16, b: u16| a + .wrapping_sub(b)); +atomic_rmw!(@new __sync_sub_and_fetch_4, u32, |a: u32, b: u32| a + .wrapping_sub(b)); + +atomic_rmw!(@old __sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b); +atomic_rmw!(@old __sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b); +atomic_rmw!(@old __sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b); + +atomic_rmw!(@new __sync_and_and_fetch_1, u8, |a: u8, b: u8| a & b); +atomic_rmw!(@new __sync_and_and_fetch_2, u16, |a: u16, b: u16| a & b); +atomic_rmw!(@new __sync_and_and_fetch_4, u32, |a: u32, b: u32| a & b); + +atomic_rmw!(@old __sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b); +atomic_rmw!(@old __sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b); +atomic_rmw!(@old __sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b); + +atomic_rmw!(@new __sync_or_and_fetch_1, u8, |a: u8, b: u8| a | b); +atomic_rmw!(@new __sync_or_and_fetch_2, u16, |a: u16, b: u16| a | b); +atomic_rmw!(@new __sync_or_and_fetch_4, u32, |a: u32, b: u32| a | b); + +atomic_rmw!(@old __sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b); +atomic_rmw!(@old __sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b); +atomic_rmw!(@old __sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b); -atomic_rmw!(__sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b); -atomic_rmw!(__sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b); -atomic_rmw!(__sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b); +atomic_rmw!(@new __sync_xor_and_fetch_1, u8, |a: u8, b: u8| a ^ b); +atomic_rmw!(@new __sync_xor_and_fetch_2, u16, |a: u16, b: u16| a ^ b); +atomic_rmw!(@new __sync_xor_and_fetch_4, u32, |a: u32, b: u32| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b); -atomic_rmw!(__sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b); +atomic_rmw!(@old __sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b)); +atomic_rmw!(@old __sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b)); +atomic_rmw!(@old __sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b)); 
-atomic_rmw!(__sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b)); -atomic_rmw!(__sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b)); -atomic_rmw!(__sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_1, u8, |a: u8, b: u8| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_2, u16, |a: u16, b: u16| !(a & b)); +atomic_rmw!(@new __sync_nand_and_fetch_4, u32, |a: u32, b: u32| !(a & b)); -atomic_rmw!(__sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b { +atomic_rmw!(@old __sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b { +atomic_rmw!(@old __sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b { +atomic_rmw!(@old __sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b { a } else { b }); -atomic_rmw!(__sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b { +atomic_rmw!(@old __sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b { a } else { b }); -atomic_rmw!(__sync_lock_test_and_set_1, u8, |_: u8, b: u8| b); -atomic_rmw!(__sync_lock_test_and_set_2, u16, |_: u16, b: u16| b); -atomic_rmw!(__sync_lock_test_and_set_4, u32, |_: u32, b: u32| b); +atomic_rmw!(@old __sync_lock_test_and_set_1, u8, |_: u8, b: u8| b); +atomic_rmw!(@old __sync_lock_test_and_set_2, u16, |_: u16, b: u16| b); +atomic_rmw!(@old __sync_lock_test_and_set_4, u32, |_: u32, b: u32| b); atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8); atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16); diff --git a/vendor/compiler_builtins/src/float/conv.rs b/vendor/compiler_builtins/src/float/conv.rs index 07b58f3d2..68ba63408 100644 --- a/vendor/compiler_builtins/src/float/conv.rs +++ b/vendor/compiler_builtins/src/float/conv.rs @@ -92,12 +92,12 @@ intrinsics! 
{ f64::from_bits(int_to_float::u64_to_f64_bits(i)) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floatuntisf(i: u128) -> f32 { f32::from_bits(int_to_float::u128_to_f32_bits(i)) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floatuntidf(i: u128) -> f64 { f64::from_bits(int_to_float::u128_to_f64_bits(i)) } @@ -129,13 +129,13 @@ intrinsics! { f64::from_bits(int_to_float::u64_to_f64_bits(i.unsigned_abs()) | sign_bit) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floattisf(i: i128) -> f32 { let sign_bit = ((i >> 127) as u32) << 31; f32::from_bits(int_to_float::u128_to_f32_bits(i.unsigned_abs()) | sign_bit) } - #[cfg_attr(not(target_feature = "llvm14-builtins-abi"), unadjusted_on_win64)] + #[cfg_attr(any(not(target_feature = "llvm14-builtins-abi"), target_os = "uefi"), unadjusted_on_win64)] pub extern "C" fn __floattidf(i: i128) -> f64 { let sign_bit = ((i >> 127) as u64) << 63; f64::from_bits(int_to_float::u128_to_f64_bits(i.unsigned_abs()) | sign_bit) diff --git a/vendor/compiler_builtins/src/lib.rs b/vendor/compiler_builtins/src/lib.rs index 009923d27..e7bc61e4c 100644 --- a/vendor/compiler_builtins/src/lib.rs +++ b/vendor/compiler_builtins/src/lib.rs @@ -6,6 +6,7 @@ #![feature(compiler_builtins)] #![feature(core_ffi_c)] #![feature(core_intrinsics)] +#![feature(inline_const)] #![feature(lang_items)] #![feature(linkage)] #![feature(naked_functions)] @@ -45,6 +46,7 @@ pub mod int; all(target_family = "wasm", target_os = "unknown"), all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "arm", target_os = "none"), + target_os = "xous", all(target_vendor = "fortanix", target_env = "sgx") ))] pub mod math; diff --git a/vendor/compiler_builtins/src/macros.rs b/vendor/compiler_builtins/src/macros.rs index 518a18d4d..7d90b7aad 100644 --- a/vendor/compiler_builtins/src/macros.rs +++ b/vendor/compiler_builtins/src/macros.rs @@ -174,7 +174,7 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( - #[cfg(all(windows, target_pointer_width = "64"))] + #[cfg(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64"))] intrinsics! { $(#[$($attr)*])* pub extern "unadjusted" fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -182,7 +182,7 @@ macro_rules! intrinsics { } } - #[cfg(not(all(windows, target_pointer_width = "64")))] + #[cfg(not(all(any(windows, all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64")))] intrinsics! { $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -209,13 +209,13 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( - #[cfg(all(windows, target_arch = "x86_64"))] + #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } - #[cfg(all(windows, target_arch = "x86_64"))] + #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] pub mod $name { #[cfg_attr(not(feature = "mangled-names"), no_mangle)] pub extern $abi fn $name( $($argname: $ty),* ) @@ -226,7 +226,7 @@ macro_rules! 
intrinsics { } } - #[cfg(not(all(windows, target_arch = "x86_64")))] + #[cfg(not(all(any(windows, target_os = "uefi"), target_arch = "x86_64")))] intrinsics! { $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { @@ -426,7 +426,7 @@ macro_rules! intrinsics { // Hack for LLVM expectations for ABI on windows. This is used by the // `#[win64_128bit_abi_hack]` attribute recognized above -#[cfg(all(windows, target_pointer_width = "64"))] +#[cfg(all(any(windows, target_os = "uefi"), target_pointer_width = "64"))] pub mod win64_128bit_abi_hack { #[repr(simd)] pub struct U64x2(u64, u64); diff --git a/vendor/compiler_builtins/src/math.rs b/vendor/compiler_builtins/src/math.rs index fa59753f8..fa9836186 100644 --- a/vendor/compiler_builtins/src/math.rs +++ b/vendor/compiler_builtins/src/math.rs @@ -20,6 +20,7 @@ macro_rules! no_mangle { target_os = "unknown", not(target_env = "wasi") ), + target_os = "xous", all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "xtensa", target_os = "none"), all(target_vendor = "fortanix", target_env = "sgx") @@ -62,6 +63,8 @@ no_mangle! { fn tanhf(n: f32) -> f32; fn ldexp(f: f64, n: i32) -> f64; fn ldexpf(f: f32, n: i32) -> f32; + fn tgamma(x: f64) -> f64; + fn tgammaf(x: f32) -> f32; } #[cfg(any( @@ -70,6 +73,8 @@ no_mangle! { target_os = "unknown", not(target_env = "wasi") ), + target_os = "xous", + all(target_arch = "x86_64", target_os = "uefi"), all(target_arch = "xtensa", target_os = "none"), all(target_vendor = "fortanix", target_env = "sgx") ))] @@ -93,7 +98,17 @@ no_mangle! { fn tanf(n: f32) -> f32; } -#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] +#[cfg(any(target_os = "xous", target_os = "uefi"))] +no_mangle! { + fn sqrtf(x: f32) -> f32; + fn sqrt(x: f64) -> f64; +} + +#[cfg(any( + all(target_vendor = "fortanix", target_env = "sgx"), + target_os = "xous", + target_os = "uefi" +))] no_mangle! { fn ceil(x: f64) -> f64; fn ceilf(x: f32) -> f32; diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs index 815132425..72003a5c4 100644 --- a/vendor/compiler_builtins/src/mem/impls.rs +++ b/vendor/compiler_builtins/src/mem/impls.rs @@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { } set_bytes_bytes(s, c, n); } + +#[inline(always)] +pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 +} diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs index a55113861..c5b0ddc16 100644 --- a/vendor/compiler_builtins/src/mem/mod.rs +++ b/vendor/compiler_builtins/src/mem/mod.rs @@ -51,16 +51,7 @@ intrinsics! { #[mem_builtin] #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { - let mut i = 0; - while i < n { - let a = *s1.add(i); - let b = *s2.add(i); - if a != b { - return a as i32 - b as i32; - } - i += 1; - } - 0 + impls::compare_bytes(s1, s2, n) } #[mem_builtin] diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs index a7ab6f605..17b461f79 100644 --- a/vendor/compiler_builtins/src/mem/x86_64.rs +++ b/vendor/compiler_builtins/src/mem/x86_64.rs @@ -16,6 +16,10 @@ // feature is present at compile-time. We don't bother detecting other features. 
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::arch::asm; +use core::intrinsics; +use core::mem; + #[inline(always)] #[cfg(target_feature = "ermsb")] pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { @@ -31,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe movsq (%rsi), (%rdi)", - "mov {byte_count:e}, %ecx", - "repe movsb (%rsi), (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) { + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + asm!( + "rep movsb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep movsq", inout("rcx") qword_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep movsb", + inout("ecx") byte_count => _, inout("rdi") dest => _, inout("rsi") src => _, options(att_syntax, nostack, preserves_flags) @@ -49,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // We can't separate this block due to std/cld + asm!( "std", - "repe movsq (%rsi), (%rdi)", - "movl {byte_count:e}, %ecx", - "addq $7, %rdi", - "addq $7, %rsi", - "repe movsb (%rsi), (%rdi)", + "rep movsb", + "sub $7, %rsi", + "sub $7, %rdi", + "mov {qword_count}, %rcx", + "rep movsq", + "test {pre_byte_count:e}, {pre_byte_count:e}", + "add $7, %rsi", + "add $7, %rdi", + "mov {pre_byte_count:e}, %ecx", + "rep movsb", "cld", - byte_count = in(reg) byte_count, - inout("rcx") qword_count => _, - inout("rdi") dest.add(count).wrapping_sub(8) => _, - inout("rsi") src.add(count).wrapping_sub(8) => _, - options(att_syntax, nostack) + pre_byte_count = in(reg) pre_byte_count, + qword_count = in(reg) qword_count, + inout("ecx") byte_count => _, + inout("rdi") dest.add(count - 1) => _, + inout("rsi") src.add(count - 1) => _, + // We modify flags, but we restore it afterwards + options(att_syntax, nostack, preserves_flags) ); } @@ -83,18 +103,82 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
- core::arch::asm!( - "repe stosq %rax, (%rdi)", - "mov {byte_count:e}, %ecx", - "repe stosb %al, (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { + let c = c as u64 * 0x0101_0101_0101_0101; + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + asm!( + "rep stosb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep stosq", inout("rcx") qword_count => _, + inout("rdi") dest => dest, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); + asm!( + "rep stosb", + inout("ecx") byte_count => _, inout("rdi") dest => _, - in("rax") (c as u64) * 0x0101010101010101, + in("rax") c, options(att_syntax, nostack, preserves_flags) ); } + +#[inline(always)] +pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { + #[inline(always)] + unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32 + where + T: Clone + Copy + Eq, + U: Clone + Copy + Eq, + F: FnOnce(*const U, *const U, usize) -> i32, + { + // Ensure T is not a ZST. + const { assert!(mem::size_of::<T>() != 0) }; + + let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>())); + while a != end { + if a.read_unaligned() != b.read_unaligned() { + return f(a.cast(), b.cast(), mem::size_of::<T>()); + } + a = a.add(1); + b = b.add(1); + } + f( + a.cast(), + b.cast(), + intrinsics::unchecked_rem(n, mem::size_of::<T>()), + ) + } + let c1 = |mut a: *const u8, mut b: *const u8, n| { + for _ in 0..n { + if a.read() != b.read() { + return i32::from(a.read()) - i32::from(b.read()); + } + a = a.add(1); + b = b.add(1); + } + 0 + }; + let c2 = |a: *const u16, b, n| cmp(a, b, n, c1); + let c4 = |a: *const u32, b, n| cmp(a, b, n, c2); + let c8 = |a: *const u64, b, n| cmp(a, b, n, c4); + let c16 = |a: *const u128, b, n| cmp(a, b, n, c8); + c16(a.cast(), b.cast(), n) +} + +/// Determine optimal parameters for a `rep` instruction. +fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { + // Unaligned writes are still slow on modern processors, so align the destination address. + let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count); + count -= pre_byte_count; + let qword_count = count >> 3; + let byte_count = count & 0b111; + (pre_byte_count, qword_count, byte_count) +} |
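The arm.rs hunks above attach #[cfg_attr(all(not(windows), not(target_vendor="apple")), linkage = "weak")] to the __aeabi_* division helpers so that, on ELF targets, a definition shipped by libc or another runtime can override the crate's copy at link time. A minimal sketch of that pattern with a hypothetical symbol name; it needs a nightly compiler because the linkage attribute is unstable:

// Nightly-only sketch: the `linkage` attribute is behind #![feature(linkage)].
#![feature(linkage)]

// Hypothetical intrinsic-style symbol. On ELF targets the weak linkage lets a
// strong definition from libc (or any other object) win at link time; Windows
// and Mach-O targets are excluded, mirroring the cfg_attr predicate in arm.rs.
#[cfg_attr(all(not(windows), not(target_vendor = "apple")), linkage = "weak")]
#[no_mangle]
pub extern "C" fn __example_weak_helper(x: u32) -> u32 {
    x.wrapping_mul(2)
}

fn main() {
    println!("{}", __example_weak_helper(21));
}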
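The arm_linux.rs change threads a second closure through atomic_rmw: f still computes the new value, and the added g(old, new) picks the return value, so one compare-and-swap loop can back both the __sync_fetch_and_* intrinsics (return the old value, the @old arm of the macro) and the newly added __sync_*_and_fetch ones (return the new value, the @new arm). A portable sketch of that shape, using AtomicU32::compare_exchange in place of the kernel's __kuser_cmpxchg helper:

use std::sync::atomic::{AtomicU32, Ordering};

// Generic CAS loop: `f` computes the new value, `g` selects which value to
// return (the old one for fetch_and_op, the new one for op_and_fetch).
fn atomic_rmw<F, G>(a: &AtomicU32, f: F, g: G) -> u32
where
    F: Fn(u32) -> u32,
    G: Fn(u32, u32) -> u32,
{
    let mut cur = a.load(Ordering::Relaxed);
    loop {
        let new = f(cur);
        match a.compare_exchange(cur, new, Ordering::SeqCst, Ordering::Relaxed) {
            Ok(_) => return g(cur, new),
            Err(actual) => cur = actual,
        }
    }
}

fn fetch_and_add(a: &AtomicU32, v: u32) -> u32 {
    atomic_rmw(a, |x| x.wrapping_add(v), |old, _| old)
}

fn add_and_fetch(a: &AtomicU32, v: u32) -> u32 {
    atomic_rmw(a, |x| x.wrapping_add(v), |_, new| new)
}

fn main() {
    let a = AtomicU32::new(5);
    assert_eq!(fetch_and_add(&a, 3), 5); // returns the old value
    assert_eq!(add_and_fetch(&a, 3), 11); // returns the new value
    assert_eq!(a.load(Ordering::Relaxed), 11);
}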
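Those intrinsics only exist in 1-, 2- and 4-byte widths, and the sub-word ones are emulated on the aligned 32-bit word that contains the value, via the align_ptr/get_shift_mask/insert_aligned helpers visible in the hunk context. A rough, single-threaded illustration of just the masking arithmetic (little-endian byte order assumed, as on ARM Linux):

// Extract the field that starts `shift` bits into the aligned word.
fn extract_aligned(aligned: u32, shift: u32, mask: u32) -> u32 {
    (aligned >> shift) & mask
}

// Insert `val` back into that field, leaving the other bytes of the word intact.
fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
    (aligned & !(mask << shift)) | ((val & mask) << shift)
}

fn main() {
    // A 4-byte aligned word holding the bytes [0x11, 0x22, 0x33, 0x44] (LE).
    let word: u32 = 0x4433_2211;
    // Byte at offset 2 within the word: shift = 16 bits, mask = 0xff.
    let (shift, mask) = (16, 0xff);
    assert_eq!(extract_aligned(word, shift, mask), 0x33);
    // Overwrite just that byte with 0xAB, as an emulated 8-bit store would.
    let updated = insert_aligned(word, 0xab, shift, mask);
    assert_eq!(updated, 0x44AB_2211);
    println!("{updated:#010x}");
}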
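In float/conv.rs the signed conversions __floattisf/__floattidf reuse the unsigned path: they convert i.unsigned_abs() and then OR the sign bit into the IEEE-754 bit pattern. The hunks above only widen their cfg gate to UEFI, but the trick itself is easy to check; the sketch below substitutes the built-in u128-to-f64 cast for the crate's int_to_float::u128_to_f64_bits:

// Signed 128-bit to f64 via the unsigned magnitude plus an explicit sign bit.
fn i128_to_f64(i: i128) -> f64 {
    let sign_bit = ((i >> 127) as u64) << 63;
    f64::from_bits((i.unsigned_abs() as f64).to_bits() | sign_bit)
}

fn main() {
    let samples = [0i128, 1, -1, 42, -42, i128::MAX, i128::MIN, -(1i128 << 100)];
    for &i in &samples {
        assert_eq!(i128_to_f64(i).to_bits(), (i as f64).to_bits());
    }
    println!("sign-bit trick matches the built-in conversion");
}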
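The macros.rs hunks widen every all(windows, ...) predicate to also cover x86_64 UEFI, which uses the Windows calling convention, so the win64 ABI workarounds apply there as well. The intrinsics! macro handles this by expanding each such function twice behind complementary cfg gates; a toy macro showing that dispatch pattern (the macro name and selector are illustrative, not the crate's real interface):

// Expands to a win64-style variant and a fallback variant of the same function,
// gated on complementary cfg predicates (the same ones the patch introduces).
macro_rules! abi_dependent {
    (fn $name:ident() -> &'static str { $win:expr, $other:expr }) => {
        #[cfg(all(
            any(windows, all(target_os = "uefi", target_arch = "x86_64")),
            target_pointer_width = "64"
        ))]
        fn $name() -> &'static str { $win }

        #[cfg(not(all(
            any(windows, all(target_os = "uefi", target_arch = "x86_64")),
            target_pointer_width = "64"
        )))]
        fn $name() -> &'static str { $other }
    };
}

abi_dependent!(fn which_abi() -> &'static str { "win64-style ABI", "default ABI" });

fn main() {
    println!("this build uses the {}", which_abi());
}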
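math.rs grows its symbol list (tgamma/tgammaf, plus sqrt/ceil and friends for the new xous and UEFI targets); each entry goes through a no_mangle! macro that emits an unmangled extern "C" wrapper around the Rust libm implementation. A cut-down sketch of that wrapper-generating pattern; the exported names carry a demo_ prefix here precisely so they cannot collide with the real C library, and std stands in for libm:

// Forward each listed signature to an implementation, exporting it under an
// unmangled C symbol name so C callers (or compiler-generated calls) can find it.
macro_rules! no_mangle {
    ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty => $imp:expr;)+) => {
        $(
            #[no_mangle]
            pub extern "C" fn $fun($($iid: $ity),+) -> $oty {
                $imp($($iid),+)
            }
        )+
    };
}

// Hypothetical "demo_" symbols backed by std; the real crate forwards to `libm`.
no_mangle! {
    fn demo_sqrtf(x: f32) -> f32 => f32::sqrt;
    fn demo_ldexp(f: f64, n: i32) -> f64 => demo_ldexp_impl;
}

fn demo_ldexp_impl(f: f64, n: i32) -> f64 {
    f * 2f64.powi(n)
}

fn main() {
    println!("{} {}", demo_sqrtf(9.0), demo_ldexp(1.5, 3));
}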
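mem/impls.rs gains compare_bytes, which is the byte-by-byte loop previously written inline in memcmp (mem/mod.rs now just forwards to it): it returns the difference between the first pair of bytes that disagree, or 0 if the first n bytes match. A safe, slice-based rendering of the same semantics with a couple of checks:

// memcmp-style comparison over the first `n` bytes of two slices.
fn compare_bytes(s1: &[u8], s2: &[u8], n: usize) -> i32 {
    for i in 0..n {
        let (a, b) = (s1[i], s2[i]);
        if a != b {
            return a as i32 - b as i32;
        }
    }
    0
}

fn main() {
    assert_eq!(compare_bytes(b"abcdef", b"abcdef", 6), 0);
    assert!(compare_bytes(b"abcdef", b"abddef", 6) < 0); // 'c' < 'd'
    assert_eq!(compare_bytes(b"abcXXX", b"abcYYY", 3), 0); // only the first 3 bytes compared
    println!("ok");
}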
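In mem/x86_64.rs, the new rep_param helper splits a count into a short unaligned prefix (enough bytes to 8-byte-align the destination), a middle section counted in quadwords, and a byte tail, because rep stosq/movsq run fastest on an aligned destination. The function is plain arithmetic, so it can be exercised directly; a standalone copy that takes the destination as a usize address for the sake of the demo:

/// Split `count` bytes at destination address `dest` into (prefix bytes needed
/// to reach 8-byte alignment, number of 8-byte quadwords, trailing bytes).
fn rep_param(dest: usize, mut count: usize) -> (usize, usize, usize) {
    let pre_byte_count = ((8 - (dest & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    (pre_byte_count, count >> 3, count & 0b111)
}

fn main() {
    // Already aligned destination: no prefix, 12 quadwords, 4-byte tail.
    assert_eq!(rep_param(0x1000, 100), (0, 12, 4));
    // Misaligned by 3: 5 prefix bytes restore alignment, then 11 quadwords + 7.
    assert_eq!(rep_param(0x1003, 100), (5, 11, 7));
    // Tiny copy: the alignment prefix is capped at `count`.
    assert_eq!(rep_param(0x1001, 3), (3, 0, 0));
    println!("ok");
}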
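With the destination aligned that way, copy_forward becomes three separate asm! blocks (prefix bytes, aligned quadwords, tail bytes), which the in-diff comment notes gives the compiler more freedom to reorder instructions; copy_backward stays in a single block because the std/cld pair that flips the direction flag must bracket the whole copy. A rough x86_64-only sketch of the forward path using stable inline asm with the same register roles (an illustration, not the crate's exact code):

// Forward copy in three pieces: align the destination, bulk-copy quadwords,
// then finish the tail. Mirrors the structure of the patched copy_forward.
#[cfg(target_arch = "x86_64")]
unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    use std::arch::asm;
    let pre = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    let rest = count - pre;
    let (qwords, tail) = (rest >> 3, rest & 0b111);
    // Unaligned prefix, one byte at a time.
    asm!("rep movsb",
        inout("rcx") pre => _, inout("rdi") dest => dest, inout("rsi") src => src,
        options(nostack, preserves_flags));
    // Aligned middle section, eight bytes at a time.
    asm!("rep movsq",
        inout("rcx") qwords => _, inout("rdi") dest => dest, inout("rsi") src => src,
        options(nostack, preserves_flags));
    // Remaining tail bytes.
    asm!("rep movsb",
        inout("rcx") tail => _, inout("rdi") dest => _, inout("rsi") src => _,
        options(nostack, preserves_flags));
}

#[cfg(target_arch = "x86_64")]
fn main() {
    let src: Vec<u8> = (0..=255u8).cycle().take(1000).collect();
    let mut dst = vec![0u8; 1000];
    unsafe { copy_forward(dst.as_mut_ptr(), src.as_ptr(), dst.len()) };
    assert_eq!(src, dst);
    println!("copied {} bytes", dst.len());
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}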
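set_bytes now computes its fill pattern once: multiplying the byte by 0x0101_0101_0101_0101 replicates it into every byte of a u64, so the same rax value feeds rep stosq and both rep stosb loops. A two-line demonstration of the splat:

fn splat(c: u8) -> u64 {
    c as u64 * 0x0101_0101_0101_0101
}

fn main() {
    assert_eq!(splat(0xab), 0xabab_abab_abab_abab);
    assert_eq!(splat(0xab).to_ne_bytes(), [0xab; 8]); // every byte of the qword is the fill byte
    println!("{:#018x}", splat(0x5a));
}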
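Finally, the new x86_64 compare_bytes compares 16 bytes at a time and, on the first mismatching chunk, re-runs the comparison at half the width (u128 → u64 → u32 → u16 → u8), so the slow byte loop only ever covers the chunk that actually differs; its generic helper also uses the inline-const assertion enabled by the new #![feature(inline_const)] in lib.rs to reject zero-sized element types at compile time. A safe, slice-based sketch of the same narrowing idea, reduced to two levels (W-byte chunks, then bytes) and using slice equality where the original reads unaligned integers:

// Compare the first `n` bytes in W-byte chunks; only the first mismatching
// chunk (and the tail) is ever examined byte-by-byte.
fn compare_bytes<const W: usize>(a: &[u8], b: &[u8], n: usize) -> i32 {
    // Compile-time check in the spirit of the patch's `const { assert!(..) }`:
    // a zero-width chunk would make the loops below meaningless.
    const { assert!(W != 0) };

    let byte_cmp = |x: &[u8], y: &[u8]| -> i32 {
        for (&p, &q) in x.iter().zip(y) {
            if p != q {
                return i32::from(p) - i32::from(q);
            }
        }
        0
    };

    let chunks = n / W;
    for i in 0..chunks {
        let (ca, cb) = (&a[i * W..(i + 1) * W], &b[i * W..(i + 1) * W]);
        if ca != cb {
            return byte_cmp(ca, cb); // narrow down inside the mismatching chunk
        }
    }
    byte_cmp(&a[chunks * W..n], &b[chunks * W..n]) // tail shorter than one chunk
}

fn main() {
    let x = vec![7u8; 100];
    let mut y = x.clone();
    assert_eq!(compare_bytes::<8>(&x, &y, 100), 0);
    y[83] = 9;
    assert!(compare_bytes::<8>(&x, &y, 100) < 0);
    assert_eq!(compare_bytes::<8>(&x, &y, 83), 0); // mismatch lies past the compared prefix
    println!("ok");
}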