summary refs log tree commit diff stats
path: root/vendor/compiler_builtins/src/mem/x86_64.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/compiler_builtins/src/mem/x86_64.rs')
-rw-r--r-- vendor/compiler_builtins/src/mem/x86_64.rs | 150
1 file changed, 117 insertions, 33 deletions
diff --git a/vendor/compiler_builtins/src/mem/x86_64.rs b/vendor/compiler_builtins/src/mem/x86_64.rs
index a7ab6f605..17b461f79 100644
--- a/vendor/compiler_builtins/src/mem/x86_64.rs
+++ b/vendor/compiler_builtins/src/mem/x86_64.rs
@@ -16,6 +16,10 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
+use core::arch::asm;
+use core::intrinsics;
+use core::mem;
+
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
@@ -31,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
- "repe movsq (%rsi), (%rdi)",
- "mov {byte_count:e}, %ecx",
- "repe movsb (%rsi), (%rdi)",
- byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
+ asm!(
+ "rep movsb",
+ inout("ecx") pre_byte_count => _,
+ inout("rdi") dest => dest,
+ inout("rsi") src => src,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep movsq",
inout("rcx") qword_count => _,
+ inout("rdi") dest => dest,
+ inout("rsi") src => src,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep movsb",
+ inout("ecx") byte_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
@@ -49,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // We can't separate this block due to std/cld
+ asm!(
"std",
- "repe movsq (%rsi), (%rdi)",
- "movl {byte_count:e}, %ecx",
- "addq $7, %rdi",
- "addq $7, %rsi",
- "repe movsb (%rsi), (%rdi)",
+ "rep movsb",
+ "sub $7, %rsi",
+ "sub $7, %rdi",
+ "mov {qword_count}, %rcx",
+ "rep movsq",
+ "test {pre_byte_count:e}, {pre_byte_count:e}",
+ "add $7, %rsi",
+ "add $7, %rdi",
+ "mov {pre_byte_count:e}, %ecx",
+ "rep movsb",
"cld",
- byte_count = in(reg) byte_count,
- inout("rcx") qword_count => _,
- inout("rdi") dest.add(count).wrapping_sub(8) => _,
- inout("rsi") src.add(count).wrapping_sub(8) => _,
- options(att_syntax, nostack)
+ pre_byte_count = in(reg) pre_byte_count,
+ qword_count = in(reg) qword_count,
+ inout("ecx") byte_count => _,
+ inout("rdi") dest.add(count - 1) => _,
+ inout("rsi") src.add(count - 1) => _,
+ // We modify flags, but we restore it afterwards
+ options(att_syntax, nostack, preserves_flags)
);
}
@@ -83,18 +103,82 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
- let qword_count = count >> 3;
- let byte_count = count & 0b111;
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
- core::arch::asm!(
- "repe stosq %rax, (%rdi)",
- "mov {byte_count:e}, %ecx",
- "repe stosb %al, (%rdi)",
- byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+ let c = c as u64 * 0x0101_0101_0101_0101;
+ let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
+ asm!(
+ "rep stosb",
+ inout("ecx") pre_byte_count => _,
+ inout("rdi") dest => dest,
+ in("rax") c,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep stosq",
inout("rcx") qword_count => _,
+ inout("rdi") dest => dest,
+ in("rax") c,
+ options(att_syntax, nostack, preserves_flags)
+ );
+ asm!(
+ "rep stosb",
+ inout("ecx") byte_count => _,
inout("rdi") dest => _,
- in("rax") (c as u64) * 0x0101010101010101,
+ in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
}
+
+#[inline(always)]
+pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
+ #[inline(always)]
+ unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
+ where
+ T: Clone + Copy + Eq,
+ U: Clone + Copy + Eq,
+ F: FnOnce(*const U, *const U, usize) -> i32,
+ {
+ // Ensure T is not a ZST.
+ const { assert!(mem::size_of::<T>() != 0) };
+
+ let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
+ while a != end {
+ if a.read_unaligned() != b.read_unaligned() {
+ return f(a.cast(), b.cast(), mem::size_of::<T>());
+ }
+ a = a.add(1);
+ b = b.add(1);
+ }
+ f(
+ a.cast(),
+ b.cast(),
+ intrinsics::unchecked_rem(n, mem::size_of::<T>()),
+ )
+ }
+ let c1 = |mut a: *const u8, mut b: *const u8, n| {
+ for _ in 0..n {
+ if a.read() != b.read() {
+ return i32::from(a.read()) - i32::from(b.read());
+ }
+ a = a.add(1);
+ b = b.add(1);
+ }
+ 0
+ };
+ let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
+ let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
+ let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
+ let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
+ c16(a.cast(), b.cast(), n)
+}
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+ // Unaligned writes are still slow on modern processors, so align the destination address.
+ let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+ count -= pre_byte_count;
+ let qword_count = count >> 3;
+ let byte_count = count & 0b111;
+ (pre_byte_count, qword_count, byte_count)
+}