summaryrefslogtreecommitdiffstats
path: root/arch/x86/lib/memset_64.S
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /arch/x86/lib/memset_64.S
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/lib/memset_64.S')
-rw-r--r--arch/x86/lib/memset_64.S117
1 files changed, 117 insertions, 0 deletions
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 0000000000..3d818b849e
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+#include <asm/export.h>
+
+.section .noinstr.text, "ax"
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the original function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
+ */
+SYM_FUNC_START(__memset)
+ ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
+
+ movq %rdi,%r9
+ movb %sil,%al
+ movq %rdx,%rcx
+ rep stosb
+ movq %r9,%rax
+ RET
+SYM_FUNC_END(__memset)
+EXPORT_SYMBOL(__memset)
+
+SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
+EXPORT_SYMBOL(memset)
+
+SYM_FUNC_START_LOCAL(memset_orig)
+ movq %rdi,%r10
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ imulq %rcx,%rax
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+ movq %rdx,%rcx
+ shrq $6,%rcx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decq %rcx
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63&(~7),%ecx
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ andl $7,%edx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %edx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ RET
+
+.Lbad_alignment:
+ cmpq $7,%rdx
+ jbe .Lhandle_7
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%rdx
+ jmp .Lafter_bad_alignment
+.Lfinal:
+SYM_FUNC_END(memset_orig)