// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
//  - FSRM - Fast Short REP MOV (Ice Lake and later)
//  - Fast Zero-Length MOVSB (On no current hardware)
//  - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::intrinsics;
use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld.
    // Walking backwards from the last byte of each buffer: copy the tail
    // bytes, then the qwords, then the pre-alignment bytes.
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify the direction flag, but we restore it afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}
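// An illustrative check of the three-phase copy above; this sketch is not part
// of the original module. The module name and test values are hypothetical, and
// it assumes a hosted x86_64 test target where the standard test harness runs.
#[cfg(test)]
mod copy_sketch {
    #[test]
    fn forward_and_backward_agree() {
        let src: [u8; 32] = core::array::from_fn(|i| i as u8);
        let mut fwd = [0u8; 32];
        let mut bwd = [0u8; 32];
        unsafe {
            // Copy 19 bytes to an offset destination so that the pre-alignment
            // bytes, the qwords, and the tail bytes are all likely exercised.
            super::copy_forward(fwd.as_mut_ptr().add(3), src.as_ptr(), 19);
            super::copy_backward(bwd.as_mut_ptr().add(3), src.as_ptr(), 19);
        }
        assert_eq!(&fwd[3..22], &src[..19]);
        assert_eq!(fwd, bwd);
    }
}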
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    // Broadcast the byte into all eight lanes of a qword for "rep stosq".
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count = count >> 3;
    let byte_count = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
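// An illustrative sketch of the byte-set and comparison entry points; this is
// not part of the original module. The module name and values are hypothetical,
// and it assumes a hosted x86_64 test target with the standard test harness.
#[cfg(test)]
mod set_and_compare_sketch {
    #[test]
    fn set_bytes_fills_and_compare_bytes_signs() {
        let mut buf = [0u8; 24];
        unsafe {
            // Fill 17 bytes at an offset so both the qword and byte phases run.
            super::set_bytes(buf.as_mut_ptr().add(1), 0xAB, 17);
        }
        assert!(buf[1..18].iter().all(|&byte| byte == 0xAB));
        assert_eq!(buf[0], 0);
        assert_eq!(buf[18], 0);

        let a = *b"abcdef";
        let b = *b"abcxef";
        unsafe {
            // Equal buffers compare as 0; the first differing byte decides the sign.
            assert_eq!(super::compare_bytes(a.as_ptr(), a.as_ptr(), 6), 0);
            assert!(super::compare_bytes(a.as_ptr(), b.as_ptr(), 6) < 0);
            assert!(super::compare_bytes(b.as_ptr(), a.as_ptr(), 6) > 0);
        }
    }
}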
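// An illustrative sketch of how `rep_param` splits a request; this is not part
// of the original module, and the module name and example address are
// hypothetical. For a destination at address 0x1005 (dest % 8 == 5) and
// count == 30: three bytes bring the destination to 8-byte alignment, three
// qwords cover the next 24 bytes, and three tail bytes remain.
#[cfg(test)]
mod rep_param_sketch {
    #[test]
    fn align_then_qwords_then_tail() {
        // The pointer is never dereferenced; it is only used for alignment math.
        let dest = 0x1005 as *mut u8;
        let (pre, qwords, tail) = super::rep_param(dest, 30);
        assert_eq!((pre, qwords, tail), (3, 3, 3));
        assert_eq!(pre + qwords * 8 + tail, 30);
    }
}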