diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /arch/x86/lib/memcpy_64.S | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S new file mode 100644 index 0000000000..76697df8df --- /dev/null +++ b/arch/x86/lib/memcpy_64.S @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright 2002 Andi Kleen */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/errno.h> +#include <asm/cpufeatures.h> +#include <asm/alternative.h> +#include <asm/export.h> + +.section .noinstr.text, "ax" + +/* + * memcpy - Copy a memory block. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + * + * The FSRM alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep movsb' itself is small enough to replace the call, but the + * two register moves blow up the code. And one of them is "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. + */ +SYM_TYPED_FUNC_START(__memcpy) + ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM + + movq %rdi, %rax + movq %rdx, %rcx + rep movsb + RET +SYM_FUNC_END(__memcpy) +EXPORT_SYMBOL(__memcpy) + +SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) +EXPORT_SYMBOL(memcpy) + +SYM_FUNC_START_LOCAL(memcpy_orig) + movq %rdi, %rax + + cmpq $0x20, %rdx + jb .Lhandle_tail + + /* + * We check whether memory false dependence could occur, + * then jump to corresponding copy mode. + */ + cmp %dil, %sil + jl .Lcopy_backward + subq $0x20, %rdx +.Lcopy_forward_loop: + subq $0x20, %rdx + + /* + * Move in blocks of 4x8 bytes: + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addl $0x20, %edx + jmp .Lhandle_tail + +.Lcopy_backward: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16 bytes trunk. + */ + .p2align 4 +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + + /* + * Calculate copy position to head. + */ + addl $0x20, %edx + subq %rdx, %rsi + subq %rdx, %rdi +.Lhandle_tail: + cmpl $16, %edx + jb .Lless_16bytes + + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + RET + .p2align 4 +.Lless_16bytes: + cmpl $8, %edx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + RET + .p2align 4 +.Lless_8bytes: + cmpl $4, %edx + jb .Lless_3bytes + + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + RET + .p2align 4 +.Lless_3bytes: + subl $1, %edx + jb .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) + +.Lend: + RET +SYM_FUNC_END(memcpy_orig) + |