diff options
Diffstat (limited to '')
-rw-r--r-- | arch/arm64/lib/Makefile | 27 | ||||
-rw-r--r-- | arch/arm64/lib/atomic_ll_sc.c | 3 | ||||
-rw-r--r-- | arch/arm64/lib/clear_page.S | 39 | ||||
-rw-r--r-- | arch/arm64/lib/clear_user.S | 62 | ||||
-rw-r--r-- | arch/arm64/lib/copy_from_user.S | 87 | ||||
-rw-r--r-- | arch/arm64/lib/copy_in_user.S | 90 | ||||
-rw-r--r-- | arch/arm64/lib/copy_page.S | 89 | ||||
-rw-r--r-- | arch/arm64/lib/copy_template.S | 193 | ||||
-rw-r--r-- | arch/arm64/lib/copy_to_user.S | 87 | ||||
-rw-r--r-- | arch/arm64/lib/delay.c | 70 | ||||
-rw-r--r-- | arch/arm64/lib/memchr.S | 44 | ||||
-rw-r--r-- | arch/arm64/lib/memcmp.S | 258 | ||||
-rw-r--r-- | arch/arm64/lib/memcpy.S | 76 | ||||
-rw-r--r-- | arch/arm64/lib/memmove.S | 199 | ||||
-rw-r--r-- | arch/arm64/lib/memset.S | 218 | ||||
-rw-r--r-- | arch/arm64/lib/strchr.S | 42 | ||||
-rw-r--r-- | arch/arm64/lib/strcmp.S | 234 | ||||
-rw-r--r-- | arch/arm64/lib/strlen.S | 126 | ||||
-rw-r--r-- | arch/arm64/lib/strncmp.S | 310 | ||||
-rw-r--r-- | arch/arm64/lib/strnlen.S | 171 | ||||
-rw-r--r-- | arch/arm64/lib/strrchr.S | 43 | ||||
-rw-r--r-- | arch/arm64/lib/tishift.S | 69 | ||||
-rw-r--r-- | arch/arm64/lib/uaccess_flushcache.c | 47 |
23 files changed, 2584 insertions, 0 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile new file mode 100644 index 000000000..5df2d611b --- /dev/null +++ b/arch/arm64/lib/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 +lib-y := clear_user.o delay.o copy_from_user.o \ + copy_to_user.o copy_in_user.o copy_page.o \ + clear_page.o memchr.o memcpy.o memmove.o memset.o \ + memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ + strchr.o strrchr.o tishift.o + +# Tell the compiler to treat all general purpose registers (with the +# exception of the IP registers, which are already handled by the caller +# in case of a PLT) as callee-saved, which allows for efficient runtime +# patching of the bl instruction in the caller with an atomic instruction +# when supported by the CPU. Result and argument registers are handled +# correctly, based on the function prototype. +lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o +CFLAGS_atomic_ll_sc.o := -ffixed-x1 -ffixed-x2 \ + -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 \ + -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9 \ + -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ + -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ + -fcall-saved-x18 -fomit-frame-pointer +CFLAGS_REMOVE_atomic_ll_sc.o := -pg +GCOV_PROFILE_atomic_ll_sc.o := n +KASAN_SANITIZE_atomic_ll_sc.o := n +KCOV_INSTRUMENT_atomic_ll_sc.o := n +UBSAN_SANITIZE_atomic_ll_sc.o := n + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o diff --git a/arch/arm64/lib/atomic_ll_sc.c b/arch/arm64/lib/atomic_ll_sc.c new file mode 100644 index 000000000..b0c538b0d --- /dev/null +++ b/arch/arm64/lib/atomic_ll_sc.c @@ -0,0 +1,3 @@ +#include <asm/atomic.h> +#define __ARM64_IN_ATOMIC_IMPL +#include <asm/atomic_ll_sc.h> diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S new file mode 100644 index 000000000..ef08e905e --- /dev/null +++ b/arch/arm64/lib/clear_page.S @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + +/* + * Clear page @dest + * + * Parameters: + * x0 - dest + */ +ENTRY(clear_page) + mrs x1, dczid_el0 + and w1, w1, #0xf + mov x2, #4 + lsl x1, x2, x1 + +1: dc zva, x0 + add x0, x0, x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret +ENDPROC(clear_page) diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S new file mode 100644 index 000000000..4374020c8 --- /dev/null +++ b/arch/arm64/lib/clear_user.S @@ -0,0 +1,62 @@ +/* + * Based on arch/arm/lib/clear_user.S + * + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/linkage.h> + +#include <asm/asm-uaccess.h> + + .text + +/* Prototype: int __arch_clear_user(void *addr, size_t sz) + * Purpose : clear some user memory + * Params : addr - user memory address to clear + * : sz - number of bytes to clear + * Returns : number of bytes NOT cleared + * + * Alignment fixed up by hardware. + */ +ENTRY(__arch_clear_user) + uaccess_enable_not_uao x2, x3, x4 + mov x2, x1 // save the size for fixup return + subs x1, x1, #8 + b.mi 2f +1: +uao_user_alternative 9f, str, sttr, xzr, x0, 8 + subs x1, x1, #8 + b.pl 1b +2: adds x1, x1, #4 + b.mi 3f +uao_user_alternative 9f, str, sttr, wzr, x0, 4 + sub x1, x1, #4 +3: adds x1, x1, #2 + b.mi 4f +uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 + sub x1, x1, #2 +4: adds x1, x1, #1 + b.mi 5f +uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 +5: mov x0, #0 + uaccess_disable_not_uao x2, x3 + ret +ENDPROC(__arch_clear_user) + + .section .fixup,"ax" + .align 2 +9: mov x0, x2 // return the original size + uaccess_disable_not_uao x2, x3 + ret + .previous diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S new file mode 100644 index 000000000..7cd6eeaa2 --- /dev/null +++ b/arch/arm64/lib/copy_from_user.S @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> + +#include <asm/cache.h> +#include <asm/asm-uaccess.h> + +/* + * Copy from user space to a kernel buffer (alignment handled by the hardware) + * + * Parameters: + * x0 - to + * x1 - from + * x2 - n + * Returns: + * x0 - bytes not copied + */ + + .macro ldrb1 ptr, regB, val + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .endm + + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm + + .macro ldrh1 ptr, regB, val + uao_user_alternative 9997f, ldrh, ldtrh, \ptr, \regB, \val + .endm + + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm + + .macro ldr1 ptr, regB, val + uao_user_alternative 9997f, ldr, ldtr, \ptr, \regB, \val + .endm + + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm + + .macro ldp1 ptr, regB, regC, val + uao_ldp 9997f, \ptr, \regB, \regC, \val + .endm + + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm + +end .req x5 +srcin .req x15 +ENTRY(__arch_copy_from_user) + uaccess_enable_not_uao x3, x4, x5 + add end, x0, x2 + mov srcin, x1 +#include "copy_template.S" + uaccess_disable_not_uao x3, x4 + mov x0, #0 // Nothing to copy + ret +ENDPROC(__arch_copy_from_user) + + .section .fixup,"ax" + .align 2 +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) + strb tmp1w, [dst], #1 +9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 + ret + .previous diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S new file mode 100644 index 000000000..b20d3a0b3 --- /dev/null +++ b/arch/arm64/lib/copy_in_user.S @@ -0,0 +1,90 @@ +/* + * Copy from user space to user space + * + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> + +#include <asm/cache.h> +#include <asm/asm-uaccess.h> + +/* + * Copy from user space to user space (alignment handled by the hardware) + * + * Parameters: + * x0 - to + * x1 - from + * x2 - n + * Returns: + * x0 - bytes not copied + */ + .macro ldrb1 ptr, regB, val + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val + .endm + + .macro strb1 ptr, regB, val + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .endm + + .macro ldrh1 ptr, regB, val + uao_user_alternative 9997f, ldrh, ldtrh, \ptr, \regB, \val + .endm + + .macro strh1 ptr, regB, val + uao_user_alternative 9997f, strh, sttrh, \ptr, \regB, \val + .endm + + .macro ldr1 ptr, regB, val + uao_user_alternative 9997f, ldr, ldtr, \ptr, \regB, \val + .endm + + .macro str1 ptr, regB, val + uao_user_alternative 9997f, str, sttr, \ptr, \regB, \val + .endm + + .macro ldp1 ptr, regB, regC, val + uao_ldp 9997f, \ptr, \regB, \regC, \val + .endm + + .macro stp1 ptr, regB, regC, val + uao_stp 9997f, \ptr, \regB, \regC, \val + .endm + +end .req x5 +srcin .req x15 + +ENTRY(__arch_copy_in_user) + uaccess_enable_not_uao x3, x4, x5 + add end, x0, x2 + mov srcin, x1 +#include "copy_template.S" + uaccess_disable_not_uao x3, x4 + mov x0, #0 + ret +ENDPROC(__arch_copy_in_user) + + .section .fixup,"ax" + .align 2 +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) +USER(9998f, sttrb tmp1w, [dst]) + add dst, dst, #1 +9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 + ret + .previous diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S new file mode 100644 index 000000000..076c43715 --- /dev/null +++ b/arch/arm64/lib/copy_page.S @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> +#include <asm/cpufeature.h> +#include <asm/alternative.h> + +/* + * Copy a page from src to dest (both are page aligned) + * + * Parameters: + * x0 - dest + * x1 - src + */ +ENTRY(copy_page) +alternative_if ARM64_HAS_NO_HW_PREFETCH + // Prefetch three cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + +alternative_if ARM64_HAS_NO_HW_PREFETCH + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + + stnp x2, x3, [x0] + stnp x4, x5, [x0, #16] + stnp x6, x7, [x0, #32] + stnp x8, x9, [x0, #48] + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + + ret +ENDPROC(copy_page) diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S new file mode 100644 index 000000000..f5b9210f1 --- /dev/null +++ b/arch/arm64/lib/copy_template.S @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 + b .Lexitfunc + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp1 A_l, A_h, src, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + ldp1 D_l, D_h, src, #16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp1 A_l, A_h, dst, #16 + ldp1 A_l, A_h, src, #16 + stp1 B_l, B_h, dst, #16 + ldp1 B_l, B_h, src, #16 + stp1 C_l, C_h, dst, #16 + ldp1 C_l, C_h, src, #16 + stp1 D_l, D_h, dst, #16 + ldp1 D_l, D_h, src, #16 + subs count, count, #64 + b.ge 1b + stp1 A_l, A_h, dst, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S new file mode 100644 index 000000000..cfdbb1fe8 --- /dev/null +++ b/arch/arm64/lib/copy_to_user.S @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> + +#include <asm/cache.h> +#include <asm/asm-uaccess.h> + +/* + * Copy to user space from a kernel buffer (alignment handled by the hardware) + * + * Parameters: + * x0 - to + * x1 - from + * x2 - n + * Returns: + * x0 - bytes not copied + */ + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm + + .macro strb1 ptr, regB, val + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val + .endm + + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm + + .macro strh1 ptr, regB, val + uao_user_alternative 9997f, strh, sttrh, \ptr, \regB, \val + .endm + + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm + + .macro str1 ptr, regB, val + uao_user_alternative 9997f, str, sttr, \ptr, \regB, \val + .endm + + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm + + .macro stp1 ptr, regB, regC, val + uao_stp 9997f, \ptr, \regB, \regC, \val + .endm + +end .req x5 +srcin .req x15 +ENTRY(__arch_copy_to_user) + uaccess_enable_not_uao x3, x4, x5 + add end, x0, x2 + mov srcin, x1 +#include "copy_template.S" + uaccess_disable_not_uao x3, x4 + mov x0, #0 + ret +ENDPROC(__arch_copy_to_user) + + .section .fixup,"ax" + .align 2 +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder + ldrb tmp1w, [srcin] +USER(9998f, sttrb tmp1w, [dst]) + add dst, dst, #1 +9998: sub x0, end, dst // bytes not copied + uaccess_disable_not_uao x3, x4 + ret + .previous diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c new file mode 100644 index 000000000..e48ac402e --- /dev/null +++ b/arch/arm64/lib/delay.c @@ -0,0 +1,70 @@ +/* + * Delay loops based on the OpenRISC implementation. + * + * Copyright (C) 2012 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timex.h> + +#include <clocksource/arm_arch_timer.h> + +#define USECS_TO_CYCLES(time_usecs) \ + xloops_to_cycles((time_usecs) * 0x10C7UL) + +static inline unsigned long xloops_to_cycles(unsigned long xloops) +{ + return (xloops * loops_per_jiffy * HZ) >> 32; +} + +void __delay(unsigned long cycles) +{ + cycles_t start = get_cycles(); + + if (arch_timer_evtstrm_available()) { + const cycles_t timer_evt_period = + USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US); + + while ((get_cycles() - start + timer_evt_period) < cycles) + wfe(); + } + + while ((get_cycles() - start) < cycles) + cpu_relax(); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + __delay(xloops_to_cycles(xloops)); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S new file mode 100644 index 000000000..0f164a4ba --- /dev/null +++ b/arch/arm64/lib/memchr.S @@ -0,0 +1,44 @@ +/* + * Based on arch/arm/lib/memchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find a character in an area of memory. + * + * Parameters: + * x0 - buf + * x1 - c + * x2 - n + * Returns: + * x0 - address of first occurrence of 'c' or 0 + */ +WEAK(memchr) + and w1, w1, #0xff +1: subs x2, x2, #1 + b.mi 2f + ldrb w3, [x0], #1 + cmp w3, w1 + b.ne 1b + sub x0, x0, #1 + ret +2: mov x0, #0 + ret +ENDPIPROC(memchr) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 000000000..fb295f52e --- /dev/null +++ b/arch/arm64/lib/memcmp.S @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* +* compare memory areas(when two memory areas' offset are different, +* alignment handled by the hardware) +* +* Parameters: +* x0 - const memory area 1 pointer +* x1 - const memory area 2 pointer +* x2 - the maximal compare byte length +* Returns: +* x0 - a compare result, maybe less than, equal to, or greater than ZERO +*/ + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +limit .req x2 +result .req x0 + +/* Internal variables. */ +data1 .req x3 +data1w .req w3 +data2 .req x4 +data2w .req w4 +has_nul .req x5 +diff .req x6 +endloop .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +pos .req x11 +limit_wd .req x12 +mask .req x13 + +WEAK(memcmp) + cbz limit, .Lret0 + eor tmp1, src1, src2 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + /* + * The input source addresses are at alignment boundary. + * Directly compare eight bytes each time. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, cs /* Last Dword or differences. */ + cbz endloop, .Lloop_aligned + + /* Not reached the limit, must have found a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ + ands limit, limit, #7 + b.eq .Lnot_limit + /* + * The remained bytes less than 8. It is needed to extract valid data + * from last eight bytes of the intended memory range. + */ + lsl limit, limit, #3 /* bytes-> bits. */ + mov mask, #~0 +CPU_BE( lsr mask, mask, limit ) +CPU_LE( lsl mask, mask, limit ) + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + b .Lnot_limit + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that precede the start point. + */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + ldr data2, [src2], #8 + /* + * We can not add limit with alignment offset(tmp1) here. Since the + * addition probably make the limit overflown. + */ + sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #3 + add limit, limit, tmp1/* Adjust the limit for the extra. */ + + lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ + neg tmp1, tmp1/* Bits to alignment -64. */ + mov tmp2, #~0 + /*mask off the non-intended bytes before the start address.*/ +CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) + + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b .Lstart_realigned + + /*src1 and src2 have different alignment offset.*/ +.Lmisaligned8: + cmp limit, #8 + b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ + + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ + + sub limit, limit, pos + /*compare the proceeding bytes in the first 8 byte segment.*/ +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*diff occurred before the last byte.*/ + cmp data1w, data2w + b.eq .Lstart_align +1: + sub result, data1, data2 + ret + +.Lstart_align: + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + + ands xzr, src1, #7 + b.eq .Lrecal_offset + /*process more leading bytes to make src1 aligned...*/ + add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ + add src2, src2, tmp3 + sub limit, limit, tmp3 + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + /*load 8 bytes from aligned SRC1..*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 /*Non-zero if differences found.*/ + csinv endloop, diff, xzr, ne + cbnz endloop, .Lunequal_proc + /*How far is the current SRC2 from the alignment boundary...*/ + and tmp3, tmp3, #7 + +.Lrecal_offset:/*src1 is aligned now..*/ + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes and compare from + * the SRC2 alignment boundary. If all 8 bytes are equal,then start + * the second part's comparison. Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + eor diff, data1, data2 /* Non-zero if differences found. */ + cbnz diff, .Lnot_limit + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + eor diff, data1, data2 /* Non-zero if differences found. */ + subs limit_wd, limit_wd, #1 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + cbz endloop, .Lloopcmp_proc +.Lunequal_proc: + cbz diff, .Lremain8 + +/* There is difference occurred in the latest comparison. */ +.Lnot_limit: +/* +* For little endian,reverse the low significant equal bits into MSB,then +* following CLZ can find how many equal bits exist. +*/ +CPU_LE( rev diff, diff ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + + /* + * The MS-non-zero bit of DIFF marks either the first bit + * that is different, or the end of the significant data. + * Shifting left now will bring the critical information into the + * top bits. + */ + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + /* + * We need to zero-extend (char is unsigned) the value and then + * perform a signed subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret + +.Lremain8: + /* Limit % 8 == 0 =>. all data are equal.*/ + ands limit, limit, #7 + b.eq .Lret0 + +.Ltiny8proc: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ + b.eq .Ltiny8proc + sub result, data1, data2 + ret +.Lret0: + mov result, #0 + ret +ENDPIPROC(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S new file mode 100644 index 000000000..dfedd4ab1 --- /dev/null +++ b/arch/arm64/lib/memcpy.S @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm + + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm + + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm + + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm + + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm + + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm + + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm + + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm + +ENTRY(__memcpy) +WEAK(memcpy) +#include "copy_template.S" + ret +ENDPIPROC(memcpy) +ENDPROC(__memcpy) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S new file mode 100644 index 000000000..e3de8f05c --- /dev/null +++ b/arch/arm64/lib/memmove.S @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Move a buffer from src to test (alignment handled by the hardware). + * If dest <= src, call memcpy, otherwise copy in reverse order. + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +tmp3 .req x5 +tmp3w .req w5 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + +ENTRY(__memmove) +WEAK(memmove) + cmp dstin, src + b.lo __memcpy + add tmp1, src, count + cmp dstin, tmp1 + b.hs __memcpy /* No overlap. */ + + add dst, dstin, count + add src, src, count + cmp count, #16 + b.lo .Ltail15 /*probably non-alignment accesses.*/ + + ands tmp2, src, #15 /* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * process the aligned offset length to make the src aligned firstly. + * those extra instructions' cost is acceptable. It also make the + * coming accesses are based on aligned address. + */ + tbz tmp2, #0, 1f + ldrb tmp1w, [src, #-1]! + strb tmp1w, [dst, #-1]! +1: + tbz tmp2, #1, 2f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +2: + tbz tmp2, #2, 3f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +3: + tbz tmp2, #3, .LSrcAligned + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltail15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! +1: + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! +2: + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! + +.Ltail15: + tbz count, #3, 1f + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! +1: + tbz count, #2, 2f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +2: + tbz count, #1, 3f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +3: + tbz count, #0, .Lexitfunc + ldrb tmp1w, [src, #-1] + strb tmp1w, [dst, #-1] + +.Lexitfunc: + ret + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 bytes here and then jump + * to the tail. + */ + ldp A_l, A_h, [src, #-16] + stp A_l, A_h, [dst, #-16] + ldp B_l, B_h, [src, #-32] + ldp C_l, C_h, [src, #-48] + stp B_l, B_h, [dst, #-32] + stp C_l, C_h, [dst, #-48] + ldp D_l, D_h, [src, #-64]! + stp D_l, D_h, [dst, #-64]! + + tst count, #0x3f + b.ne .Ltail63 + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-load 64 bytes data. */ + ldp A_l, A_h, [src, #-16] + ldp B_l, B_h, [src, #-32] + ldp C_l, C_h, [src, #-48] + ldp D_l, D_h, [src, #-64]! +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp A_l, A_h, [dst, #-16] + ldp A_l, A_h, [src, #-16] + stp B_l, B_h, [dst, #-32] + ldp B_l, B_h, [src, #-32] + stp C_l, C_h, [dst, #-48] + ldp C_l, C_h, [src, #-48] + stp D_l, D_h, [dst, #-64]! + ldp D_l, D_h, [src, #-64]! + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst, #-16] + stp B_l, B_h, [dst, #-32] + stp C_l, C_h, [dst, #-48] + stp D_l, D_h, [dst, #-64]! + + tst count, #0x3f + b.ne .Ltail63 + ret +ENDPIPROC(memmove) +ENDPROC(__memmove) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S new file mode 100644 index 000000000..316263c47 --- /dev/null +++ b/arch/arm64/lib/memset.S @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Fill in the buffer with character c (alignment handled by the hardware) + * + * Parameters: + * x0 - buf + * x1 - c + * x2 - n + * Returns: + * x0 - buf + */ + +dstin .req x0 +val .req w1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +zva_len_x .req x5 +zva_len .req w5 +zva_bits_x .req x6 + +A_l .req x7 +A_lw .req w7 +dst .req x8 +tmp3w .req w9 +tmp3 .req x9 + +ENTRY(__memset) +WEAK(memset) + mov dst, dstin /* Preserve return value. */ + and A_lw, val, #255 + orr A_lw, A_lw, A_lw, lsl #8 + orr A_lw, A_lw, A_lw, lsl #16 + orr A_l, A_l, A_l, lsl #32 + + cmp count, #15 + b.hi .Lover16_proc + /*All store maybe are non-aligned..*/ + tbz count, #3, 1f + str A_l, [dst], #8 +1: + tbz count, #2, 2f + str A_lw, [dst], #4 +2: + tbz count, #1, 3f + strh A_lw, [dst], #2 +3: + tbz count, #0, 4f + strb A_lw, [dst] +4: + ret + +.Lover16_proc: + /*Whether the start address is aligned with 16.*/ + neg tmp2, dst + ands tmp2, tmp2, #15 + b.eq .Laligned +/* +* The count is not less than 16, we can use stp to store the start 16 bytes, +* then adjust the dst aligned with 16.This process will make the current +* memory address at alignment boundary. +*/ + stp A_l, A_l, [dst] /*non-aligned store..*/ + /*make the dst aligned..*/ + sub count, count, tmp2 + add dst, dst, tmp2 + +.Laligned: + cbz A_l, .Lzero_mem + +.Ltail_maybe_long: + cmp count, #64 + b.ge .Lnot_short +.Ltail63: + ands tmp1, count, #0x30 + b.eq 3f + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + stp A_l, A_l, [dst], #16 +1: + stp A_l, A_l, [dst], #16 +2: + stp A_l, A_l, [dst], #16 +/* +* The last store length is less than 16,use stp to write last 16 bytes. +* It will lead some bytes written twice and the access is non-aligned. +*/ +3: + ands count, count, #15 + cbz count, 4f + add dst, dst, count + stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ +4: + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line, this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lnot_short: + sub dst, dst, #16/* Pre-bias. */ + sub count, count, #64 +1: + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + stp A_l, A_l, [dst, #48] + stp A_l, A_l, [dst, #64]! + subs count, count, #64 + b.ge 1b + tst count, #0x3f + add dst, dst, #16 + b.ne .Ltail63 +.Lexitfunc: + ret + + /* + * For zeroing memory, check to see if we can use the ZVA feature to + * zero entire 'cache' lines. + */ +.Lzero_mem: + cmp count, #63 + b.le .Ltail63 + /* + * For zeroing small amounts of memory, it's not worth setting up + * the line-clear code. + */ + cmp count, #128 + b.lt .Lnot_short /*count is at least 128 bytes*/ + + mrs tmp1, dczid_el0 + tbnz tmp1, #4, .Lnot_short + mov tmp3w, #4 + and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ + lsl zva_len, tmp3w, zva_len + + ands tmp3w, zva_len, #63 + /* + * ensure the zva_len is not less than 64. + * It is not meaningful to use ZVA if the block size is less than 64. + */ + b.ne .Lnot_short +.Lzero_by_line: + /* + * Compute how far we need to go to become suitably aligned. We're + * already at quad-word alignment. + */ + cmp count, zva_len_x + b.lt .Lnot_short /* Not enough to reach alignment. */ + sub zva_bits_x, zva_len_x, #1 + neg tmp2, dst + ands tmp2, tmp2, zva_bits_x + b.eq 2f /* Already aligned. */ + /* Not aligned, check that there's enough to copy after alignment.*/ + sub tmp1, count, tmp2 + /* + * grantee the remain length to be ZVA is bigger than 64, + * avoid to make the 2f's process over mem range.*/ + cmp tmp1, #64 + ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ + b.lt .Lnot_short + /* + * We know that there's at least 64 bytes to zero and that it's safe + * to overrun by 64 bytes. + */ + mov count, tmp1 +1: + stp A_l, A_l, [dst] + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + subs tmp2, tmp2, #64 + stp A_l, A_l, [dst, #48] + add dst, dst, #64 + b.ge 1b + /* We've overrun a bit, so adjust dst downwards.*/ + add dst, dst, tmp2 +2: + sub count, count, zva_len_x +3: + dc zva, dst + add dst, dst, zva_len_x + subs count, count, zva_len_x + b.ge 3b + ands count, count, zva_bits_x + b.ne .Ltail_maybe_long + ret +ENDPIPROC(memset) +ENDPROC(__memset) diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S new file mode 100644 index 000000000..7c83091d1 --- /dev/null +++ b/arch/arm64/lib/strchr.S @@ -0,0 +1,42 @@ +/* + * Based on arch/arm/lib/strchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find the first occurrence of a character in a string. + * + * Parameters: + * x0 - str + * x1 - c + * Returns: + * x0 - address of first occurrence of 'c' or 0 + */ +WEAK(strchr) + and w1, w1, #0xff +1: ldrb w2, [x0], #1 + cmp w2, w1 + ccmp w2, wzr, #4, ne + b.ne 1b + sub x0, x0, #1 + cmp w2, w1 + csel x0, x0, xzr, eq + ret +ENDPROC(strchr) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 000000000..7d5d15398 --- /dev/null +++ b/arch/arm64/lib/strcmp.S @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * Returns: + * x0 - an integer less than, equal to, or greater than zero + * if s1 is found, respectively, to be less than, to match, + * or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +result .req x0 + +/* Internal variables. */ +data1 .req x2 +data1w .req w2 +data2 .req x3 +data2w .req w3 +has_nul .req x4 +diff .req x5 +syndrome .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +zeroones .req x10 +pos .req x11 + +WEAK(strcmp) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloop_aligned + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that preceed the start point. + */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b .Lstart_realigned + +.Lmisaligned8: + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned. + */ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + ands xzr, src1, #7 + b.eq .Lrecal_offset + /*process more leading bytes to make str1 aligned...*/ + add src1, src1, tmp3 + add src2, src2, tmp3 + /*load 8 bytes from aligned str1 and non-aligned str2..*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloopcmp_proc + +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We ca not just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /*Re-compute the NUL-byte detection, using a byte-reversed value. */ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + + clz pos, syndrome + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +ENDPIPROC(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 000000000..8e0b14205 --- /dev/null +++ b/arch/arm64/lib/strlen.S @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * calculate the length of a string + * + * Parameters: + * x0 - const string pointer + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 + +/* Locals and temporaries. */ +src .req x1 +data1 .req x2 +data2 .req x3 +data2a .req x4 +has_nul1 .req x5 +has_nul2 .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +tmp4 .req x10 +zeroones .req x11 +pos .req x12 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +WEAK(strlen) + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq .Lloop + + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + ret + +.Lmisaligned: + cmp tmp1, #8 + neg tmp1, tmp1 + ldp data1, data2, [src], #16 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned +ENDPIPROC(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 000000000..66bd14593 --- /dev/null +++ b/arch/arm64/lib/strncmp.S @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * x2 - the maximal length to be compared + * Returns: + * x0 - an integer less than, equal to, or greater than zero if s1 is found, + * respectively, to be less than, to match, or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +limit .req x2 +result .req x0 + +/* Internal variables. */ +data1 .req x3 +data1w .req w3 +data2 .req x4 +data2w .req w4 +has_nul .req x5 +diff .req x6 +syndrome .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +zeroones .req x11 +pos .req x12 +limit_wd .req x13 +mask .req x14 +endloop .req x15 + +WEAK(strncmp) + cbz limit, .Lret0 + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + /* Calculate the number of full and partial words -1. */ + /* + * when limit is mulitply of 8, if not sub 1, + * the judgement of last dword will wrong. + */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq .Lloop_aligned + + /*Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lnot_limit + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +CPU_BE( lsr mask, mask, limit ) +CPU_LE( lsl mask, mask, limit ) + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +.Lnot_limit: + orr syndrome, diff, has_nul + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that precede the start point. + * We also need to adjust the limit calculations, but without + * overflowing if the limit is near ULONG_MAX. + */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ + add limit, limit, tmp1 + add tmp3, tmp3, tmp1 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 + b .Lstart_realigned + +/*when src1 offset is not equal to src2 offset...*/ +.Lmisaligned8: + cmp limit, #8 + b.lo .Ltiny8proc /*limit < 8... */ + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned.*/ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ + /* + * Here, limit is not less than 8, so directly run .Ltinycmp + * without checking the limit.*/ + sub limit, limit, pos +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + /*process more leading bytes to make str1 aligned...*/ + ands xzr, src1, #7 + b.eq .Lrecal_offset + add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ + add src2, src2, tmp3 + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub limit, limit, tmp3 + lsr limit_wd, limit, #3 + subs limit_wd, limit_wd, #1 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.ne .Lunequal_proc + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, eq + cbnz endloop, .Lunequal_proc + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.eq .Lloopcmp_proc + +.Lunequal_proc: + orr syndrome, diff, has_nul + cbz syndrome, .Lremain8 +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We can't just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + clz pos, syndrome + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret + +.Lremain8: + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lret0 +.Ltiny8proc: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltiny8proc + sub result, data1, data2 + ret + +.Lret0: + mov result, #0 + ret +ENDPIPROC(strncmp) diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 000000000..355be0444 --- /dev/null +++ b/arch/arm64/lib/strnlen.S @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * determine the length of a fixed-size string + * + * Parameters: + * x0 - const string pointer + * x1 - maximal string length + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 +limit .req x1 + +/* Locals and temporaries. */ +src .req x2 +data1 .req x3 +data2 .req x4 +data2a .req x5 +has_nul1 .req x6 +has_nul2 .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +tmp4 .req x11 +zeroones .req x12 +pos .req x13 +limit_wd .req x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +WEAK(strnlen) + cbz limit, .Lhit_limit + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + subs limit_wd, limit_wd, #1 + orr tmp1, has_nul1, has_nul2 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ + b.eq .Lloop + + cbz tmp1, .Lhit_limit /* No null in final Qword. */ + + /* + * We know there's a null in the final Qword. The easiest thing + * to do now is work out the length of the string and return + * MIN (len, limit). + */ + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*perpare data to re-calculate the syndrome*/ + + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + ret + +.Lmisaligned: + /* + * Deal with a partial first word. + * We're doing two things in parallel here; + * 1) Calculate the number of words (but avoiding overflow if + * limit is near ULONG_MAX) - to do this we need to work out + * limit + tmp1 - 1 as a 65-bit value before shifting it; + * 2) Load and mask the initial data words - we force the bytes + * before the ones we are interested in to 0xff - this ensures + * early bytes will not hit any zero detection. + */ + ldp data1, data2, [src], #16 + + sub limit_wd, limit, #1 + and tmp3, limit_wd, #15 + lsr limit_wd, limit_wd, #4 + + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #4 + + neg tmp4, tmp1 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ + + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + + cmp tmp1, #8 + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned + +.Lhit_limit: + mov len, limit + ret +ENDPIPROC(strnlen) diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S new file mode 100644 index 000000000..ea84924d5 --- /dev/null +++ b/arch/arm64/lib/strrchr.S @@ -0,0 +1,43 @@ +/* + * Based on arch/arm/lib/strrchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find the last occurrence of a character in a string. + * + * Parameters: + * x0 - str + * x1 - c + * Returns: + * x0 - address of last occurrence of 'c' or 0 + */ +WEAK(strrchr) + mov x3, #0 + and w1, w1, #0xff +1: ldrb w2, [x0], #1 + cbz w2, 2f + cmp w2, w1 + b.ne 1b + sub x3, x0, #1 + b 1b +2: mov x0, x3 + ret +ENDPIPROC(strrchr) diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S new file mode 100644 index 000000000..0fdff9779 --- /dev/null +++ b/arch/arm64/lib/tishift.S @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) + * + * Copyright (C) 2017-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +ENTRY(__ashlti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsl x1, x1, x2 + lsr x3, x0, x3 + lsl x2, x0, x2 + orr x1, x1, x3 + mov x0, x2 +1: + ret +2: + neg w1, w3 + mov x2, #0 + lsl x1, x0, x1 + mov x0, x2 + ret +ENDPROC(__ashlti3) + +ENTRY(__ashrti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsr x0, x0, x2 + lsl x3, x1, x3 + asr x2, x1, x2 + orr x0, x0, x3 + mov x1, x2 +1: + ret +2: + neg w0, w3 + asr x2, x1, #63 + asr x0, x1, x0 + mov x1, x2 + ret +ENDPROC(__ashrti3) + +ENTRY(__lshrti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsr x0, x0, x2 + lsl x3, x1, x3 + lsr x2, x1, x2 + orr x0, x0, x3 + mov x1, x2 +1: + ret +2: + neg w0, w3 + mov x2, #0 + lsr x0, x1, x0 + mov x1, x2 + ret +ENDPROC(__lshrti3) diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c new file mode 100644 index 000000000..b6ceafdb8 --- /dev/null +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/uaccess.h> +#include <asm/barrier.h> +#include <asm/cacheflush.h> + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + __clean_dcache_area_pop(dst, cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_address(page) + offset, len); +} + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __arch_copy_from_user(to, from, n); + + /* See above */ + __clean_dcache_area_pop(to, n - rc); + return rc; +} |