From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 vendor/compiler_builtins/src/mem/impls.rs  | 267 +++++++++++++++++++++++++++++
 vendor/compiler_builtins/src/mem/mod.rs    | 211 +++++++++++++++++++++++
 vendor/compiler_builtins/src/mem/x86_64.rs | 100 +++++++++++
 3 files changed, 578 insertions(+)
 create mode 100644 vendor/compiler_builtins/src/mem/impls.rs
 create mode 100644 vendor/compiler_builtins/src/mem/mod.rs
 create mode 100644 vendor/compiler_builtins/src/mem/x86_64.rs
(limited to 'vendor/compiler_builtins/src/mem')

diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs
new file mode 100644
index 000000000..815132425
--- /dev/null
+++ b/vendor/compiler_builtins/src/mem/impls.rs
@@ -0,0 +1,267 @@
+use core::intrinsics::likely;
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+const WORD_MASK: usize = WORD_SIZE - 1;
+
+// If the number of bytes involved exceeds this threshold we will opt for word-wise copy.
+// The value selected here is max(2 * WORD_SIZE, 16):
+// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
+//   word-wise copy.
+// * The word-wise copy logic needs to perform some checks, so it has some small overhead. A
+//   minimum of 16 ensures that even on 32-bit platforms we have copied at least 8 bytes through
+//   word-wise copy, so the saving of word-wise copy outweighs the fixed overhead.
+const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
+    2 * WORD_SIZE
+} else {
+    16
+};
+
+#[cfg(feature = "mem-unaligned")]
+unsafe fn read_usize_unaligned(x: *const usize) -> usize {
+    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
+    // is translated to memcpy in LLVM.
+    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
+    core::mem::transmute(x_read)
+}
+
+#[inline(always)]
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
+    #[inline(always)]
+    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_end = dest.add(n);
+        while dest < dest_end {
+            *dest = *src;
+            dest = dest.add(1);
+            src = src.add(1);
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = *src_usize;
+            dest_usize = dest_usize.add(1);
+            src_usize = src_usize.add(1);
+        }
+    }
+
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        // Calculate the misalignment offset and shift needed to reassemble value.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
+
+        // Realign src
+        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
+        // This will read (but won't use) bytes out of bound.
+        // cfg needed because not all targets will have atomic loads that can be lowered
+        // (e.g. BPF, MSP430), or provided by an external library (e.g.
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_usize < dest_end { + src_aligned = src_aligned.add(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift); + #[cfg(target_endian = "big")] + let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift); + prev_word = cur_word; + + *dest_usize = resembled; + dest_usize = dest_usize.add(1); + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_end = dest.add(n) as *mut usize; + + while dest_usize < dest_end { + *dest_usize = read_usize_unaligned(src_usize); + dest_usize = dest_usize.add(1); + src_usize = src_usize.add(1); + } + } + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK; + copy_forward_bytes(dest, src, dest_misalignment); + dest = dest.add(dest_misalignment); + src = src.add(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_forward_aligned_words(dest, src, n_words); + } else { + copy_forward_misaligned_words(dest, src, n_words); + } + dest = dest.add(n_words); + src = src.add(n_words); + n -= n_words; + } + copy_forward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) { + // The following backward copy helper functions uses the pointers past the end + // as their inputs instead of pointers to the start! + #[inline(always)] + unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) { + let dest_start = dest.sub(n); + while dest_start < dest { + dest = dest.sub(1); + src = src.sub(1); + *dest = *src; + } + } + + #[inline(always)] + unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = *src_usize; + } + } + + #[cfg(not(feature = "mem-unaligned"))] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + // Calculate the misalignment offset and shift needed to reassemble value. + let offset = src as usize & WORD_MASK; + let shift = offset * 8; + + // Realign src_aligned + let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize; + // This will read (but won't use) bytes out of bound. + // cfg needed because not all targets will have atomic loads that can be lowered + // (e.g. BPF, MSP430), or provided by an external library (e.g. 
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_start < dest_usize { + src_aligned = src_aligned.sub(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift; + #[cfg(target_endian = "big")] + let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift; + prev_word = cur_word; + + dest_usize = dest_usize.sub(1); + *dest_usize = resembled; + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = read_usize_unaligned(src_usize); + } + } + + let mut dest = dest.add(n); + let mut src = src.add(n); + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = dest as usize & WORD_MASK; + copy_backward_bytes(dest, src, dest_misalignment); + dest = dest.sub(dest_misalignment); + src = src.sub(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_backward_aligned_words(dest, src, n_words); + } else { + copy_backward_misaligned_words(dest, src, n_words); + } + dest = dest.sub(n_words); + src = src.sub(n_words); + n -= n_words; + } + copy_backward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { + #[inline(always)] + pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) { + let end = s.add(n); + while s < end { + *s = c; + s = s.add(1); + } + } + + #[inline(always)] + pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) { + let mut broadcast = c as usize; + let mut bits = 8; + while bits < WORD_SIZE * 8 { + broadcast |= broadcast << bits; + bits *= 2; + } + + let mut s_usize = s as *mut usize; + let end = s.add(n) as *mut usize; + + while s_usize < end { + *s_usize = broadcast; + s_usize = s_usize.add(1); + } + } + + if likely(n >= WORD_COPY_THRESHOLD) { + // Align s + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let misalignment = (s as usize).wrapping_neg() & WORD_MASK; + set_bytes_bytes(s, c, misalignment); + s = s.add(misalignment); + n -= misalignment; + + let n_words = n & !WORD_MASK; + set_bytes_words(s, c, n_words); + s = s.add(n_words); + n -= n_words; + } + set_bytes_bytes(s, c, n); +} diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs new file mode 100644 index 000000000..a55113861 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/mod.rs @@ -0,0 +1,211 @@ +// Trying to satisfy clippy here is hopeless +#![allow(clippy::style)] + +#[allow(warnings)] +#[cfg(target_pointer_width = "16")] +type c_int = i16; +#[allow(warnings)] +#[cfg(not(target_pointer_width = "16"))] +type c_int = i32; + +use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}; +use core::mem; +use core::ops::{BitOr, Shl}; + +// memcpy/memmove/memset have optimized implementations on some architectures +#[cfg_attr( + all(not(feature = 
"no-asm"), target_arch = "x86_64"), + path = "x86_64.rs" +)] +mod impls; + +intrinsics! { + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + impls::copy_forward(dest, src, n); + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + let delta = (dest as usize).wrapping_sub(src as usize); + if delta >= n { + // We can copy forwards because either dest is far enough ahead of src, + // or src is ahead of dest (and delta overflowed). + impls::copy_forward(dest, src, n); + } else { + impls::copy_backward(dest, src, n); + } + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 { + impls::set_bytes(s, c as u8, n); + s + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + memcmp(s1, s2, n) + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize { + let mut n = 0; + let mut s = s; + while *s != 0 { + n += 1; + s = s.offset(1); + } + n + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memcpy_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memmove_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + if src < dest as *const T { + // copy from end + let mut i = n; + while i != 0 { + i -= 1; + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + } + } else { + // copy from beginning + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } + } +} + +// `T` must be a primitive integer type, and `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memset_element_unordered_atomic(s: *mut T, c: u8, bytes: usize) +where + T: Copy + From + Shl + BitOr, +{ + unsafe { + let n = exact_div(bytes, mem::size_of::()); + + // Construct a value of type `T` consisting of repeated `c` + // bytes, to let us ensure we write each `T` atomically. 
+ let mut x = T::from(c); + let mut i = 1; + while i < mem::size_of::() { + x = x << 8 | T::from(c); + i += 1; + } + + // Write it to `s` + let mut i = 0; + while i < n { + atomic_store_unordered(s.add(i), x); + i += 1; + } + } +} + +intrinsics! { + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_1(s: *mut u8, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_2(s: *mut u16, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_4(s: *mut u32, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_8(s: *mut u64, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_16(s: *mut u128, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, 
bytes); + } +} diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs new file mode 100644 index 000000000..a7ab6f605 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/x86_64.rs @@ -0,0 +1,100 @@ +// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have +// been enhanced to perform better than an simple qword loop, making them ideal +// for implementing memcpy/memset. Note that "rep cmps" has received no such +// enhancement, so it is not used to implement memcmp. +// +// On certain recent Intel processors, "rep movsb" and "rep stosb" have been +// further enhanced to automatically select the best microarchitectural +// implementation based on length and alignment. See the following features from +// the "IntelĀ® 64 and IA-32 Architectures Optimization Reference Manual": +// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) +// - FSRM - Fast Short REP MOV (Ice Lake and later) +// - Fast Zero-Length MOVSB (On no current hardware) +// - Fast Short STOSB (On no current hardware) +// +// To simplify things, we switch to using the byte-based variants if the "ermsb" +// feature is present at compile-time. We don't bother detecting other features. +// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsb (%rsi), (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsq (%rsi), (%rdi)", + "mov {byte_count:e}, %ecx", + "repe movsb (%rsi), (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "std", + "repe movsq (%rsi), (%rdi)", + "movl {byte_count:e}, %ecx", + "addq $7, %rdi", + "addq $7, %rsi", + "repe movsb (%rsi), (%rdi)", + "cld", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest.add(count).wrapping_sub(8) => _, + inout("rsi") src.add(count).wrapping_sub(8) => _, + options(att_syntax, nostack) + ); +} + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe stosb %al, (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("al") c => _, + options(att_syntax, nostack, preserves_flags) + ) +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
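+    // Multiplying `c` by 0x0101010101010101 broadcasts it into every byte of RAX,
+    // so `rep stosq` can store the fill pattern one qword at a time before the
+    // trailing `rep stosb` handles the remaining 0-7 bytes.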
+ core::arch::asm!( + "repe stosq %rax, (%rdi)", + "mov {byte_count:e}, %ecx", + "repe stosb %al, (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + in("rax") (c as u64) * 0x0101010101010101, + options(att_syntax, nostack, preserves_flags) + ); +} -- cgit v1.2.3
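A minimal standalone sketch (illustrative only, not taken from the vendored sources) of the
overlap check that `memmove` above relies on, using plain `usize` addresses in place of raw
pointers: copying forward is safe whenever `dest.wrapping_sub(src) >= n`, i.e. whenever `dest`
does not fall inside the source range.

    // Hypothetical helper mirroring the check in `memmove`.
    fn must_copy_backward(dest: usize, src: usize, n: usize) -> bool {
        // True exactly when `dest` lies within [src, src + n), the one case where a
        // forward copy would clobber source bytes before they have been read.
        dest.wrapping_sub(src) < n
    }

    fn main() {
        assert!(must_copy_backward(104, 100, 16));  // overlapping, dest ahead of src
        assert!(!must_copy_backward(100, 104, 16)); // dest behind src: delta wraps, forward is fine
        assert!(!must_copy_backward(200, 100, 16)); // disjoint ranges: forward is fine
    }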