diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /arch/x86/lib/memmove_64.S | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/lib/memmove_64.S')
-rw-r--r-- | arch/x86/lib/memmove_64.S | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S new file mode 100644 index 0000000000..ccdf3a5970 --- /dev/null +++ b/arch/x86/lib/memmove_64.S @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. + * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> + */ +#include <linux/linkage.h> +#include <asm/cpufeatures.h> +#include <asm/alternative.h> +#include <asm/export.h> + +#undef memmove + +.section .noinstr.text, "ax" + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +SYM_FUNC_START(__memmove) + + mov %rdi, %rax + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f + +#define CHECK_LEN cmp $0x20, %rdx; jb 1f +#define MEMMOVE_BYTES movq %rdx, %rcx; rep movsb; RET +.Lmemmove_begin_forward: + ALTERNATIVE_2 __stringify(CHECK_LEN), \ + __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \ + __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM + + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32 bytes forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f +.Lmemmove_end_forward: + + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $0x20, %rdx + jb 1f + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32 bytes backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + RET +SYM_FUNC_END(__memmove) +EXPORT_SYMBOL(__memmove) + +SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove) +EXPORT_SYMBOL(memmove) |