From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 vendor/compiler_builtins/src/mem/impls.rs  | 267 +++++++++++++++++++++++++++++
 vendor/compiler_builtins/src/mem/mod.rs    | 211 +++++++++++++++++++++++
 vendor/compiler_builtins/src/mem/x86_64.rs | 100 +++++++++++
 3 files changed, 578 insertions(+)
 create mode 100644 vendor/compiler_builtins/src/mem/impls.rs
 create mode 100644 vendor/compiler_builtins/src/mem/mod.rs
 create mode 100644 vendor/compiler_builtins/src/mem/x86_64.rs
(limited to 'vendor/compiler_builtins/src/mem')

diff --git a/vendor/compiler_builtins/src/mem/impls.rs b/vendor/compiler_builtins/src/mem/impls.rs
new file mode 100644
index 000000000..815132425
--- /dev/null
+++ b/vendor/compiler_builtins/src/mem/impls.rs
@@ -0,0 +1,267 @@
+use core::intrinsics::likely;
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+const WORD_MASK: usize = WORD_SIZE - 1;
+
+// If the number of bytes involved exceeds this threshold we will opt for word-wise copy.
+// The value selected here is max(2 * WORD_SIZE, 16):
+// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
+//   word-wise copy.
+// * The word-wise copy logic needs to perform some checks, so it has some small overhead. A
+//   minimum of 16 ensures that even on 32-bit platforms we have copied at least 8 bytes through
+//   word-wise copy, so the saving of word-wise copy outweighs the fixed overhead.
+const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
+    2 * WORD_SIZE
+} else {
+    16
+};
+
+#[cfg(feature = "mem-unaligned")]
+unsafe fn read_usize_unaligned(x: *const usize) -> usize {
+    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
+    // is translated to memcpy in LLVM.
+    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
+    core::mem::transmute(x_read)
+}
+
+#[inline(always)]
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
+    #[inline(always)]
+    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_end = dest.add(n);
+        while dest < dest_end {
+            *dest = *src;
+            dest = dest.add(1);
+            src = src.add(1);
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = *src_usize;
+            dest_usize = dest_usize.add(1);
+            src_usize = src_usize.add(1);
+        }
+    }
+
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        // Calculate the misalignment offset and shift needed to reassemble value.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
+
+        // Realign src
+        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
+        // This will read (but won't use) bytes out of bound.
+        // cfg needed because not all targets will have atomic loads that can be lowered
+        // (e.g. BPF, MSP430), or provided by an external library (e.g.
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_usize < dest_end { + src_aligned = src_aligned.add(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift); + #[cfg(target_endian = "big")] + let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift); + prev_word = cur_word; + + *dest_usize = resembled; + dest_usize = dest_usize.add(1); + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_end = dest.add(n) as *mut usize; + + while dest_usize < dest_end { + *dest_usize = read_usize_unaligned(src_usize); + dest_usize = dest_usize.add(1); + src_usize = src_usize.add(1); + } + } + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK; + copy_forward_bytes(dest, src, dest_misalignment); + dest = dest.add(dest_misalignment); + src = src.add(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_forward_aligned_words(dest, src, n_words); + } else { + copy_forward_misaligned_words(dest, src, n_words); + } + dest = dest.add(n_words); + src = src.add(n_words); + n -= n_words; + } + copy_forward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) { + // The following backward copy helper functions uses the pointers past the end + // as their inputs instead of pointers to the start! + #[inline(always)] + unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) { + let dest_start = dest.sub(n); + while dest_start < dest { + dest = dest.sub(1); + src = src.sub(1); + *dest = *src; + } + } + + #[inline(always)] + unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = *src_usize; + } + } + + #[cfg(not(feature = "mem-unaligned"))] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + // Calculate the misalignment offset and shift needed to reassemble value. + let offset = src as usize & WORD_MASK; + let shift = offset * 8; + + // Realign src_aligned + let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize; + // This will read (but won't use) bytes out of bound. + // cfg needed because not all targets will have atomic loads that can be lowered + // (e.g. BPF, MSP430), or provided by an external library (e.g. 
RV32I) + #[cfg(target_has_atomic_load_store = "ptr")] + let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned); + #[cfg(not(target_has_atomic_load_store = "ptr"))] + let mut prev_word = core::ptr::read_volatile(src_aligned); + + while dest_start < dest_usize { + src_aligned = src_aligned.sub(1); + let cur_word = *src_aligned; + #[cfg(target_endian = "little")] + let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift; + #[cfg(target_endian = "big")] + let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift; + prev_word = cur_word; + + dest_usize = dest_usize.sub(1); + *dest_usize = resembled; + } + } + + #[cfg(feature = "mem-unaligned")] + #[inline(always)] + unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) { + let mut dest_usize = dest as *mut usize; + let mut src_usize = src as *mut usize; + let dest_start = dest.sub(n) as *mut usize; + + while dest_start < dest_usize { + dest_usize = dest_usize.sub(1); + src_usize = src_usize.sub(1); + *dest_usize = read_usize_unaligned(src_usize); + } + } + + let mut dest = dest.add(n); + let mut src = src.add(n); + + if n >= WORD_COPY_THRESHOLD { + // Align dest + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let dest_misalignment = dest as usize & WORD_MASK; + copy_backward_bytes(dest, src, dest_misalignment); + dest = dest.sub(dest_misalignment); + src = src.sub(dest_misalignment); + n -= dest_misalignment; + + let n_words = n & !WORD_MASK; + let src_misalignment = src as usize & WORD_MASK; + if likely(src_misalignment == 0) { + copy_backward_aligned_words(dest, src, n_words); + } else { + copy_backward_misaligned_words(dest, src, n_words); + } + dest = dest.sub(n_words); + src = src.sub(n_words); + n -= n_words; + } + copy_backward_bytes(dest, src, n); +} + +#[inline(always)] +pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { + #[inline(always)] + pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) { + let end = s.add(n); + while s < end { + *s = c; + s = s.add(1); + } + } + + #[inline(always)] + pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) { + let mut broadcast = c as usize; + let mut bits = 8; + while bits < WORD_SIZE * 8 { + broadcast |= broadcast << bits; + bits *= 2; + } + + let mut s_usize = s as *mut usize; + let end = s.add(n) as *mut usize; + + while s_usize < end { + *s_usize = broadcast; + s_usize = s_usize.add(1); + } + } + + if likely(n >= WORD_COPY_THRESHOLD) { + // Align s + // Because of n >= 2 * WORD_SIZE, dst_misalignment < n + let misalignment = (s as usize).wrapping_neg() & WORD_MASK; + set_bytes_bytes(s, c, misalignment); + s = s.add(misalignment); + n -= misalignment; + + let n_words = n & !WORD_MASK; + set_bytes_words(s, c, n_words); + s = s.add(n_words); + n -= n_words; + } + set_bytes_bytes(s, c, n); +} diff --git a/vendor/compiler_builtins/src/mem/mod.rs b/vendor/compiler_builtins/src/mem/mod.rs new file mode 100644 index 000000000..a55113861 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/mod.rs @@ -0,0 +1,211 @@ +// Trying to satisfy clippy here is hopeless +#![allow(clippy::style)] + +#[allow(warnings)] +#[cfg(target_pointer_width = "16")] +type c_int = i16; +#[allow(warnings)] +#[cfg(not(target_pointer_width = "16"))] +type c_int = i32; + +use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}; +use core::mem; +use core::ops::{BitOr, Shl}; + +// memcpy/memmove/memset have optimized implementations on some architectures +#[cfg_attr( + all(not(feature = 
"no-asm"), target_arch = "x86_64"), + path = "x86_64.rs" +)] +mod impls; + +intrinsics! { + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + impls::copy_forward(dest, src, n); + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + let delta = (dest as usize).wrapping_sub(src as usize); + if delta >= n { + // We can copy forwards because either dest is far enough ahead of src, + // or src is ahead of dest (and delta overflowed). + impls::copy_forward(dest, src, n); + } else { + impls::copy_backward(dest, src, n); + } + dest + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 { + impls::set_bytes(s, c as u8, n); + s + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + memcmp(s1, s2, n) + } + + #[mem_builtin] + #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] + pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize { + let mut n = 0; + let mut s = s; + while *s != 0 { + n += 1; + s = s.offset(1); + } + n + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memcpy_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } +} + +// `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memmove_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { + unsafe { + let n = exact_div(bytes, mem::size_of::()); + if src < dest as *const T { + // copy from end + let mut i = n; + while i != 0 { + i -= 1; + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + } + } else { + // copy from beginning + let mut i = 0; + while i < n { + atomic_store_unordered(dest.add(i), atomic_load_unordered(src.add(i))); + i += 1; + } + } + } +} + +// `T` must be a primitive integer type, and `bytes` must be a multiple of `mem::size_of::()` +#[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] +fn memset_element_unordered_atomic(s: *mut T, c: u8, bytes: usize) +where + T: Copy + From + Shl + BitOr, +{ + unsafe { + let n = exact_div(bytes, mem::size_of::()); + + // Construct a value of type `T` consisting of repeated `c` + // bytes, to let us ensure we write each `T` atomically. 
+ let mut x = T::from(c); + let mut i = 1; + while i < mem::size_of::() { + x = x << 8 | T::from(c); + i += 1; + } + + // Write it to `s` + let mut i = 0; + while i < n { + atomic_store_unordered(s.add(i), x); + i += 1; + } + } +} + +intrinsics! { + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memcpy_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_2(dest: *mut u16, src: *const u16, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_4(dest: *mut u32, src: *const u32, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_8(dest: *mut u64, src: *const u64, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memmove_element_unordered_atomic_16(dest: *mut u128, src: *const u128, bytes: usize) -> () { + memmove_element_unordered_atomic(dest, src, bytes); + } + + #[cfg(target_has_atomic_load_store = "8")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_1(s: *mut u8, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "16")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_2(s: *mut u16, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "32")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_4(s: *mut u32, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "64")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_8(s: *mut u64, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, bytes); + } + #[cfg(target_has_atomic_load_store = "128")] + pub unsafe extern "C" fn __llvm_memset_element_unordered_atomic_16(s: *mut u128, c: u8, bytes: usize) -> () { + memset_element_unordered_atomic(s, c, 
bytes); + } +} diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs new file mode 100644 index 000000000..a7ab6f605 --- /dev/null +++ b/vendor/compiler_builtins/src/mem/x86_64.rs @@ -0,0 +1,100 @@ +// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have +// been enhanced to perform better than an simple qword loop, making them ideal +// for implementing memcpy/memset. Note that "rep cmps" has received no such +// enhancement, so it is not used to implement memcmp. +// +// On certain recent Intel processors, "rep movsb" and "rep stosb" have been +// further enhanced to automatically select the best microarchitectural +// implementation based on length and alignment. See the following features from +// the "IntelĀ® 64 and IA-32 Architectures Optimization Reference Manual": +// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) +// - FSRM - Fast Short REP MOV (Ice Lake and later) +// - Fast Zero-Length MOVSB (On no current hardware) +// - Fast Short STOSB (On no current hardware) +// +// To simplify things, we switch to using the byte-based variants if the "ermsb" +// feature is present at compile-time. We don't bother detecting other features. +// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsb (%rsi), (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe movsq (%rsi), (%rdi)", + "mov {byte_count:e}, %ecx", + "repe movsb (%rsi), (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); +} + +#[inline(always)] +pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "std", + "repe movsq (%rsi), (%rdi)", + "movl {byte_count:e}, %ecx", + "addq $7, %rdi", + "addq $7, %rsi", + "repe movsb (%rsi), (%rdi)", + "cld", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest.add(count).wrapping_sub(8) => _, + inout("rsi") src.add(count).wrapping_sub(8) => _, + options(att_syntax, nostack) + ); +} + +#[inline(always)] +#[cfg(target_feature = "ermsb")] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. + core::arch::asm!( + "repe stosb %al, (%rdi)", + inout("rcx") count => _, + inout("rdi") dest => _, + inout("al") c => _, + options(att_syntax, nostack, preserves_flags) + ) +} + +#[inline(always)] +#[cfg(not(target_feature = "ermsb"))] +pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { + let qword_count = count >> 3; + let byte_count = count & 0b111; + // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
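+    // Multiplying `c` by 0x0101010101010101 broadcasts it into every byte of RAX,
+    // so `rep stosq` can store the fill pattern one qword at a time before the
+    // trailing `rep stosb` handles the remaining 0-7 bytes.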
+ core::arch::asm!( + "repe stosq %rax, (%rdi)", + "mov {byte_count:e}, %ecx", + "repe stosb %al, (%rdi)", + byte_count = in(reg) byte_count, + inout("rcx") qword_count => _, + inout("rdi") dest => _, + in("rax") (c as u64) * 0x0101010101010101, + options(att_syntax, nostack, preserves_flags) + ); +} -- cgit v1.2.3
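A minimal standalone sketch (illustrative only, not taken from the vendored sources) of the
overlap check that `memmove` above relies on, using plain `usize` addresses in place of raw
pointers: copying forward is safe whenever `dest.wrapping_sub(src) >= n`, i.e. whenever `dest`
does not fall inside the source range.

    // Hypothetical helper mirroring the check in `memmove`.
    fn must_copy_backward(dest: usize, src: usize, n: usize) -> bool {
        // True exactly when `dest` lies within [src, src + n), the one case where a
        // forward copy would clobber source bytes before they have been read.
        dest.wrapping_sub(src) < n
    }

    fn main() {
        assert!(must_copy_backward(104, 100, 16));  // overlapping, dest ahead of src
        assert!(!must_copy_backward(100, 104, 16)); // dest behind src: delta wraps, forward is fine
        assert!(!must_copy_backward(200, 100, 16)); // disjoint ranges: forward is fine
    }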