diff options
Diffstat (limited to 'arch/arm64/lib')
27 files changed, 4763 insertions, 0 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile new file mode 100644 index 0000000000..29490be254 --- /dev/null +++ b/arch/arm64/lib/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +lib-y := clear_user.o delay.o copy_from_user.o \ + copy_to_user.o copy_page.o \ + clear_page.o csum.o insn.o memchr.o memcpy.o \ + memset.o memcmp.o strcmp.o strncmp.o strlen.o \ + strnlen.o strchr.o strrchr.o tishift.o + +ifeq ($(CONFIG_KERNEL_MODE_NEON), y) +obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o +CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only +CFLAGS_xor-neon.o += -ffreestanding +# Enable <arm_neon.h> +CFLAGS_xor-neon.o += -isystem $(shell $(CC) -print-file-name=include) +endif + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o + +obj-$(CONFIG_CRC32) += crc32.o + +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_ARM64_MTE) += mte.o + +obj-$(CONFIG_KASAN_SW_TAGS) += kasan_sw_tags.o diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S new file mode 100644 index 0000000000..ebde40e7fa --- /dev/null +++ b/arch/arm64/lib/clear_page.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + +/* + * Clear page @dest + * + * Parameters: + * x0 - dest + */ +SYM_FUNC_START(__pi_clear_page) + mrs x1, dczid_el0 + tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */ + and w1, w1, #0xf + mov x2, #4 + lsl x1, x2, x1 + +1: dc zva, x0 + add x0, x0, x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret + +2: stnp xzr, xzr, [x0] + stnp xzr, xzr, [x0, #16] + stnp xzr, xzr, [x0, #32] + stnp xzr, xzr, [x0, #48] + add x0, x0, #64 + tst x0, #(PAGE_SIZE - 1) + b.ne 2b + ret +SYM_FUNC_END(__pi_clear_page) +SYM_FUNC_ALIAS(clear_page, __pi_clear_page) +EXPORT_SYMBOL(clear_page) diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S new file mode 100644 index 0000000000..a5a5f5b97b --- /dev/null +++ b/arch/arm64/lib/clear_user.S @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Arm Ltd. + */ + +#include <linux/linkage.h> +#include <asm/asm-uaccess.h> + + .text + +/* Prototype: int __arch_clear_user(void *addr, size_t sz) + * Purpose : clear some user memory + * Params : addr - user memory address to clear + * : sz - number of bytes to clear + * Returns : number of bytes NOT cleared + * + * Alignment fixed up by hardware. + */ + + .p2align 4 + // Alignment is for the loop, but since the prologue (including BTI) + // is also 16 bytes we can keep any padding outside the function +SYM_FUNC_START(__arch_clear_user) + add x2, x0, x1 + subs x1, x1, #8 + b.mi 2f +1: +USER(9f, sttr xzr, [x0]) + add x0, x0, #8 + subs x1, x1, #8 + b.hi 1b +USER(9f, sttr xzr, [x2, #-8]) + mov x0, #0 + ret + +2: tbz x1, #2, 3f +USER(9f, sttr wzr, [x0]) +USER(8f, sttr wzr, [x2, #-4]) + mov x0, #0 + ret + +3: tbz x1, #1, 4f +USER(9f, sttrh wzr, [x0]) +4: tbz x1, #0, 5f +USER(7f, sttrb wzr, [x2, #-1]) +5: mov x0, #0 + ret + + // Exception fixups +7: sub x0, x2, #5 // Adjust for faulting on the final byte... +8: add x0, x0, #4 // ...or the second word of the 4-7 byte case +9: sub x0, x2, x0 + ret +SYM_FUNC_END(__arch_clear_user) +EXPORT_SYMBOL(__arch_clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S new file mode 100644 index 0000000000..34e3179075 --- /dev/null +++ b/arch/arm64/lib/copy_from_user.S @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/linkage.h> + +#include <asm/asm-uaccess.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Copy from user space to a kernel buffer (alignment handled by the hardware) + * + * Parameters: + * x0 - to + * x1 - from + * x2 - n + * Returns: + * x0 - bytes not copied + */ + + .macro ldrb1 reg, ptr, val + user_ldst 9998f, ldtrb, \reg, \ptr, \val + .endm + + .macro strb1 reg, ptr, val + strb \reg, [\ptr], \val + .endm + + .macro ldrh1 reg, ptr, val + user_ldst 9997f, ldtrh, \reg, \ptr, \val + .endm + + .macro strh1 reg, ptr, val + strh \reg, [\ptr], \val + .endm + + .macro ldr1 reg, ptr, val + user_ldst 9997f, ldtr, \reg, \ptr, \val + .endm + + .macro str1 reg, ptr, val + str \reg, [\ptr], \val + .endm + + .macro ldp1 reg1, reg2, ptr, val + user_ldp 9997f, \reg1, \reg2, \ptr, \val + .endm + + .macro stp1 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr], \val + .endm + +end .req x5 +srcin .req x15 +SYM_FUNC_START(__arch_copy_from_user) + add end, x0, x2 + mov srcin, x1 +#include "copy_template.S" + mov x0, #0 // Nothing to copy + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) + strb tmp1w, [dst], #1 +9998: sub x0, end, dst // bytes not copied + ret +SYM_FUNC_END(__arch_copy_from_user) +EXPORT_SYMBOL(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S new file mode 100644 index 0000000000..c336d2ffde --- /dev/null +++ b/arch/arm64/lib/copy_page.S @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> +#include <asm/cpufeature.h> +#include <asm/alternative.h> + +/* + * Copy a page from src to dest (both are page aligned) + * + * Parameters: + * x0 - dest + * x1 - src + */ +SYM_FUNC_START(__pi_copy_page) +alternative_if ARM64_HAS_NO_HW_PREFETCH + // Prefetch three cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + add x0, x0, #256 + add x1, x1, #128 +1: + tst x0, #(PAGE_SIZE - 1) + +alternative_if ARM64_HAS_NO_HW_PREFETCH + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + stnp x2, x3, [x0, #-256] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16 - 256] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32 - 256] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48 - 256] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64 - 256] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80 - 256] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96 - 256] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112 - 256] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.ne 1b + + stnp x2, x3, [x0, #-256] + stnp x4, x5, [x0, #16 - 256] + stnp x6, x7, [x0, #32 - 256] + stnp x8, x9, [x0, #48 - 256] + stnp x10, x11, [x0, #64 - 256] + stnp x12, x13, [x0, #80 - 256] + stnp x14, x15, [x0, #96 - 256] + stnp x16, x17, [x0, #112 - 256] + + ret +SYM_FUNC_END(__pi_copy_page) +SYM_FUNC_ALIAS(copy_page, __pi_copy_page) +EXPORT_SYMBOL(copy_page) diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S new file mode 100644 index 0000000000..488df234c4 --- /dev/null +++ b/arch/arm64/lib/copy_template.S @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + */ + + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 + b .Lexitfunc + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp1 A_l, A_h, src, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + ldp1 D_l, D_h, src, #16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp1 A_l, A_h, dst, #16 + ldp1 A_l, A_h, src, #16 + stp1 B_l, B_h, dst, #16 + ldp1 B_l, B_h, src, #16 + stp1 C_l, C_h, dst, #16 + ldp1 C_l, C_h, src, #16 + stp1 D_l, D_h, dst, #16 + ldp1 D_l, D_h, src, #16 + subs count, count, #64 + b.ge 1b + stp1 A_l, A_h, dst, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S new file mode 100644 index 0000000000..8022317726 --- /dev/null +++ b/arch/arm64/lib/copy_to_user.S @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. + */ + +#include <linux/linkage.h> + +#include <asm/asm-uaccess.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Copy to user space from a kernel buffer (alignment handled by the hardware) + * + * Parameters: + * x0 - to + * x1 - from + * x2 - n + * Returns: + * x0 - bytes not copied + */ + .macro ldrb1 reg, ptr, val + ldrb \reg, [\ptr], \val + .endm + + .macro strb1 reg, ptr, val + user_ldst 9998f, sttrb, \reg, \ptr, \val + .endm + + .macro ldrh1 reg, ptr, val + ldrh \reg, [\ptr], \val + .endm + + .macro strh1 reg, ptr, val + user_ldst 9997f, sttrh, \reg, \ptr, \val + .endm + + .macro ldr1 reg, ptr, val + ldr \reg, [\ptr], \val + .endm + + .macro str1 reg, ptr, val + user_ldst 9997f, sttr, \reg, \ptr, \val + .endm + + .macro ldp1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val + .endm + + .macro stp1 reg1, reg2, ptr, val + user_stp 9997f, \reg1, \reg2, \ptr, \val + .endm + +end .req x5 +srcin .req x15 +SYM_FUNC_START(__arch_copy_to_user) + add end, x0, x2 + mov srcin, x1 +#include "copy_template.S" + mov x0, #0 + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder + ldrb tmp1w, [srcin] +USER(9998f, sttrb tmp1w, [dst]) + add dst, dst, #1 +9998: sub x0, end, dst // bytes not copied + ret +SYM_FUNC_END(__arch_copy_to_user) +EXPORT_SYMBOL(__arch_copy_to_user) diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S new file mode 100644 index 0000000000..8340dccff4 --- /dev/null +++ b/arch/arm64/lib/crc32.S @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Accelerated CRC32(C) using AArch64 CRC instructions + * + * Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/assembler.h> + + .arch armv8-a+crc + + .macro byteorder, reg, be + .if \be +CPU_LE( rev \reg, \reg ) + .else +CPU_BE( rev \reg, \reg ) + .endif + .endm + + .macro byteorder16, reg, be + .if \be +CPU_LE( rev16 \reg, \reg ) + .else +CPU_BE( rev16 \reg, \reg ) + .endif + .endm + + .macro bitorder, reg, be + .if \be + rbit \reg, \reg + .endif + .endm + + .macro bitorder16, reg, be + .if \be + rbit \reg, \reg + lsr \reg, \reg, #16 + .endif + .endm + + .macro bitorder8, reg, be + .if \be + rbit \reg, \reg + lsr \reg, \reg, #24 + .endif + .endm + + .macro __crc32, c, be=0 + bitorder w0, \be + cmp x2, #16 + b.lt 8f // less than 16 bytes + + and x7, x2, #0x1f + and x2, x2, #~0x1f + cbz x7, 32f // multiple of 32 bytes + + and x8, x7, #0xf + ldp x3, x4, [x1] + add x8, x8, x1 + add x1, x1, x7 + ldp x5, x6, [x8] + byteorder x3, \be + byteorder x4, \be + byteorder x5, \be + byteorder x6, \be + bitorder x3, \be + bitorder x4, \be + bitorder x5, \be + bitorder x6, \be + + tst x7, #8 + crc32\c\()x w8, w0, x3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #4 + lsr x4, x3, #32 + crc32\c\()w w8, w0, w3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #2 + lsr w4, w3, #16 + crc32\c\()h w8, w0, w3 + csel w3, w3, w4, eq + csel w0, w0, w8, eq + tst x7, #1 + crc32\c\()b w8, w0, w3 + csel w0, w0, w8, eq + tst x7, #16 + crc32\c\()x w8, w0, x5 + crc32\c\()x w8, w8, x6 + csel w0, w0, w8, eq + cbz x2, 0f + +32: ldp x3, x4, [x1], #32 + sub x2, x2, #32 + ldp x5, x6, [x1, #-16] + byteorder x3, \be + byteorder x4, \be + byteorder x5, \be + byteorder x6, \be + bitorder x3, \be + bitorder x4, \be + bitorder x5, \be + bitorder x6, \be + crc32\c\()x w0, w0, x3 + crc32\c\()x w0, w0, x4 + crc32\c\()x w0, w0, x5 + crc32\c\()x w0, w0, x6 + cbnz x2, 32b +0: bitorder w0, \be + ret + +8: tbz x2, #3, 4f + ldr x3, [x1], #8 + byteorder x3, \be + bitorder x3, \be + crc32\c\()x w0, w0, x3 +4: tbz x2, #2, 2f + ldr w3, [x1], #4 + byteorder w3, \be + bitorder w3, \be + crc32\c\()w w0, w0, w3 +2: tbz x2, #1, 1f + ldrh w3, [x1], #2 + byteorder16 w3, \be + bitorder16 w3, \be + crc32\c\()h w0, w0, w3 +1: tbz x2, #0, 0f + ldrb w3, [x1] + bitorder8 w3, \be + crc32\c\()b w0, w0, w3 +0: bitorder w0, \be + ret + .endm + + .align 5 +SYM_FUNC_START(crc32_le) +alternative_if_not ARM64_HAS_CRC32 + b crc32_le_base +alternative_else_nop_endif + __crc32 +SYM_FUNC_END(crc32_le) + + .align 5 +SYM_FUNC_START(__crc32c_le) +alternative_if_not ARM64_HAS_CRC32 + b __crc32c_le_base +alternative_else_nop_endif + __crc32 c +SYM_FUNC_END(__crc32c_le) + + .align 5 +SYM_FUNC_START(crc32_be) +alternative_if_not ARM64_HAS_CRC32 + b crc32_be_base +alternative_else_nop_endif + __crc32 be=1 +SYM_FUNC_END(crc32_be) diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c new file mode 100644 index 0000000000..2432683e48 --- /dev/null +++ b/arch/arm64/lib/csum.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2019-2020 Arm Ltd. + +#include <linux/compiler.h> +#include <linux/kasan-checks.h> +#include <linux/kernel.h> + +#include <net/checksum.h> + +/* Looks dumb, but generates nice-ish code */ +static u64 accumulate(u64 sum, u64 data) +{ + __uint128_t tmp = (__uint128_t)sum + data; + return tmp + (tmp >> 64); +} + +/* + * We over-read the buffer and this makes KASAN unhappy. Instead, disable + * instrumentation and call kasan explicitly. + */ +unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len) +{ + unsigned int offset, shift, sum; + const u64 *ptr; + u64 data, sum64 = 0; + + if (unlikely(len <= 0)) + return 0; + + offset = (unsigned long)buff & 7; + /* + * This is to all intents and purposes safe, since rounding down cannot + * result in a different page or cache line being accessed, and @buff + * should absolutely not be pointing to anything read-sensitive. We do, + * however, have to be careful not to piss off KASAN, which means using + * unchecked reads to accommodate the head and tail, for which we'll + * compensate with an explicit check up-front. + */ + kasan_check_read(buff, len); + ptr = (u64 *)(buff - offset); + len = len + offset - 8; + + /* + * Head: zero out any excess leading bytes. Shifting back by the same + * amount should be at least as fast as any other way of handling the + * odd/even alignment, and means we can ignore it until the very end. + */ + shift = offset * 8; + data = *ptr++; +#ifdef __LITTLE_ENDIAN + data = (data >> shift) << shift; +#else + data = (data << shift) >> shift; +#endif + + /* + * Body: straightforward aligned loads from here on (the paired loads + * underlying the quadword type still only need dword alignment). The + * main loop strictly excludes the tail, so the second loop will always + * run at least once. + */ + while (unlikely(len > 64)) { + __uint128_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = *(__uint128_t *)ptr; + tmp2 = *(__uint128_t *)(ptr + 2); + tmp3 = *(__uint128_t *)(ptr + 4); + tmp4 = *(__uint128_t *)(ptr + 6); + + len -= 64; + ptr += 8; + + /* This is the "don't dump the carry flag into a GPR" idiom */ + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp2 += (tmp2 >> 64) | (tmp2 << 64); + tmp3 += (tmp3 >> 64) | (tmp3 << 64); + tmp4 += (tmp4 >> 64) | (tmp4 << 64); + tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64); + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64); + tmp3 += (tmp3 >> 64) | (tmp3 << 64); + tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64); + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + tmp1 = ((tmp1 >> 64) << 64) | sum64; + tmp1 += (tmp1 >> 64) | (tmp1 << 64); + sum64 = tmp1 >> 64; + } + while (len > 8) { + __uint128_t tmp; + + sum64 = accumulate(sum64, data); + tmp = *(__uint128_t *)ptr; + + len -= 16; + ptr += 2; + +#ifdef __LITTLE_ENDIAN + data = tmp >> 64; + sum64 = accumulate(sum64, tmp); +#else + data = tmp; + sum64 = accumulate(sum64, tmp >> 64); +#endif + } + if (len > 0) { + sum64 = accumulate(sum64, data); + data = *ptr; + len -= 8; + } + /* + * Tail: zero any over-read bytes similarly to the head, again + * preserving odd/even alignment. + */ + shift = len * -8; +#ifdef __LITTLE_ENDIAN + data = (data << shift) >> shift; +#else + data = (data >> shift) << shift; +#endif + sum64 = accumulate(sum64, data); + + /* Finally, folding */ + sum64 += (sum64 >> 32) | (sum64 << 32); + sum = sum64 >> 32; + sum += (sum >> 16) | (sum << 16); + if (offset & 1) + return (u16)swab32(sum); + + return sum >> 16; +} + +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + __uint128_t src, dst; + u64 sum = (__force u64)csum; + + src = *(const __uint128_t *)saddr->s6_addr; + dst = *(const __uint128_t *)daddr->s6_addr; + + sum += (__force u32)htonl(len); +#ifdef __LITTLE_ENDIAN + sum += (u32)proto << 24; +#else + sum += proto; +#endif + src += (src >> 64) | (src << 64); + dst += (dst >> 64) | (dst << 64); + + sum = accumulate(sum, src >> 64); + sum = accumulate(sum, dst >> 64); + + sum += ((sum >> 32) | (sum << 32)); + return csum_fold((__force __wsum)(sum >> 32)); +} +EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c new file mode 100644 index 0000000000..5b7890139b --- /dev/null +++ b/arch/arm64/lib/delay.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Delay loops based on the OpenRISC implementation. + * + * Copyright (C) 2012 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timex.h> + +#include <clocksource/arm_arch_timer.h> + +#define USECS_TO_CYCLES(time_usecs) \ + xloops_to_cycles((time_usecs) * 0x10C7UL) + +static inline unsigned long xloops_to_cycles(unsigned long xloops) +{ + return (xloops * loops_per_jiffy * HZ) >> 32; +} + +void __delay(unsigned long cycles) +{ + cycles_t start = get_cycles(); + + if (cpus_have_const_cap(ARM64_HAS_WFXT)) { + u64 end = start + cycles; + + /* + * Start with WFIT. If an interrupt makes us resume + * early, use a WFET loop to complete the delay. + */ + wfit(end); + while ((get_cycles() - start) < cycles) + wfet(end); + } else if (arch_timer_evtstrm_available()) { + const cycles_t timer_evt_period = + USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US); + + while ((get_cycles() - start + timer_evt_period) < cycles) + wfe(); + } + + while ((get_cycles() - start) < cycles) + cpu_relax(); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + __delay(xloops_to_cycles(xloops)); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/arm64/lib/error-inject.c b/arch/arm64/lib/error-inject.c new file mode 100644 index 0000000000..ed15021da3 --- /dev/null +++ b/arch/arm64/lib/error-inject.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/error-injection.h> +#include <linux/kprobes.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * 'regs' represents the state on entry of a predefined function in + * the kernel/module and which is captured on a kprobe. + * + * When kprobe returns back from exception it will override the end + * of probed function and directly return to the predefined + * function's caller. + */ + instruction_pointer_set(regs, procedure_link_pointer(regs)); +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c new file mode 100644 index 0000000000..a635ab83fe --- /dev/null +++ b/arch/arm64/lib/insn.c @@ -0,0 +1,1517 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu <liuj97@gmail.com> + * + * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com> + */ +#include <linux/bitops.h> +#include <linux/bug.h> +#include <linux/printk.h> +#include <linux/sizes.h> +#include <linux/types.h> + +#include <asm/debug-monitors.h> +#include <asm/errno.h> +#include <asm/insn.h> +#include <asm/kprobes.h> + +#define AARCH64_INSN_SF_BIT BIT(31) +#define AARCH64_INSN_N_BIT BIT(22) +#define AARCH64_INSN_LSL_12 BIT(22) + +static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, + u32 *maskp, int *shiftp) +{ + u32 mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_26: + mask = BIT(26) - 1; + shift = 0; + break; + case AARCH64_INSN_IMM_19: + mask = BIT(19) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_16: + mask = BIT(16) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_14: + mask = BIT(14) - 1; + shift = 5; + break; + case AARCH64_INSN_IMM_12: + mask = BIT(12) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_9: + mask = BIT(9) - 1; + shift = 12; + break; + case AARCH64_INSN_IMM_7: + mask = BIT(7) - 1; + shift = 15; + break; + case AARCH64_INSN_IMM_6: + case AARCH64_INSN_IMM_S: + mask = BIT(6) - 1; + shift = 10; + break; + case AARCH64_INSN_IMM_R: + mask = BIT(6) - 1; + shift = 16; + break; + case AARCH64_INSN_IMM_N: + mask = 1; + shift = 22; + break; + default: + return -EINVAL; + } + + *maskp = mask; + *shiftp = shift; + + return 0; +} + +#define ADR_IMM_HILOSPLIT 2 +#define ADR_IMM_SIZE SZ_2M +#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) +#define ADR_IMM_LOSHIFT 29 +#define ADR_IMM_HISHIFT 5 + +u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) +{ + u32 immlo, immhi, mask; + int shift; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; + immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; + insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; + mask = ADR_IMM_SIZE - 1; + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("%s: unknown immediate encoding %d\n", __func__, + type); + return 0; + } + } + + return (insn >> shift) & mask; +} + +u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, + u32 insn, u64 imm) +{ + u32 immlo, immhi, mask; + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_IMM_ADR: + shift = 0; + immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; + imm >>= ADR_IMM_HILOSPLIT; + immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; + imm = immlo | immhi; + mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | + (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); + break; + default: + if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { + pr_err("%s: unknown immediate encoding %d\n", __func__, + type); + return AARCH64_BREAK_FAULT; + } + } + + /* Update the immediate field. */ + insn &= ~(mask << shift); + insn |= (imm & mask) << shift; + + return insn; +} + +u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, + u32 insn) +{ + int shift; + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return 0; + } + + return (insn >> shift) & GENMASK(4, 0); +} + +static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, + u32 insn, + enum aarch64_insn_register reg) +{ + int shift; + + if (insn == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { + pr_err("%s: unknown register encoding %d\n", __func__, reg); + return AARCH64_BREAK_FAULT; + } + + switch (type) { + case AARCH64_INSN_REGTYPE_RT: + case AARCH64_INSN_REGTYPE_RD: + shift = 0; + break; + case AARCH64_INSN_REGTYPE_RN: + shift = 5; + break; + case AARCH64_INSN_REGTYPE_RT2: + case AARCH64_INSN_REGTYPE_RA: + shift = 10; + break; + case AARCH64_INSN_REGTYPE_RM: + case AARCH64_INSN_REGTYPE_RS: + shift = 16; + break; + default: + pr_err("%s: unknown register type encoding %d\n", __func__, + type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~(GENMASK(4, 0) << shift); + insn |= reg << shift; + + return insn; +} + +static const u32 aarch64_insn_ldst_size[] = { + [AARCH64_INSN_SIZE_8] = 0, + [AARCH64_INSN_SIZE_16] = 1, + [AARCH64_INSN_SIZE_32] = 2, + [AARCH64_INSN_SIZE_64] = 3, +}; + +static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, + u32 insn) +{ + u32 size; + + if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) { + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + size = aarch64_insn_ldst_size[type]; + insn &= ~GENMASK(31, 30); + insn |= size << 30; + + return insn; +} + +static inline long label_imm_common(unsigned long pc, unsigned long addr, + long range) +{ + long offset; + + if ((pc & 0x3) || (addr & 0x3)) { + pr_err("%s: A64 instructions must be word aligned\n", __func__); + return range; + } + + offset = ((long)addr - (long)pc); + + if (offset < -range || offset >= range) { + pr_err("%s: offset out of range\n", __func__); + return range; + } + + return offset; +} + +u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + /* + * B/BL support [-128M, 128M) offset + * ARM64 virtual address arrangement guarantees all kernel and module + * texts are within +/-128M. + */ + offset = label_imm_common(pc, addr, SZ_128M); + if (offset >= SZ_128M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_bl_value(); + break; + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_b_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_branch_type type) +{ + u32 insn; + long offset; + + offset = label_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + switch (type) { + case AARCH64_INSN_BRANCH_COMP_ZERO: + insn = aarch64_insn_get_cbz_value(); + break; + case AARCH64_INSN_BRANCH_COMP_NONZERO: + insn = aarch64_insn_get_cbnz_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, + enum aarch64_insn_condition cond) +{ + u32 insn; + long offset; + + offset = label_imm_common(pc, addr, SZ_1M); + + insn = aarch64_insn_get_bcond_value(); + + if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { + pr_err("%s: unknown condition encoding %d\n", __func__, cond); + return AARCH64_BREAK_FAULT; + } + insn |= cond; + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, + enum aarch64_insn_branch_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_BRANCH_NOLINK: + insn = aarch64_insn_get_br_value(); + break; + case AARCH64_INSN_BRANCH_LINK: + insn = aarch64_insn_get_blr_value(); + break; + case AARCH64_INSN_BRANCH_RETURN: + insn = aarch64_insn_get_ret_value(); + break; + default: + pr_err("%s: unknown branch encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); +} + +u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register offset, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_REG_OFFSET: + insn = aarch64_insn_get_ldr_reg_value(); + break; + case AARCH64_INSN_LDST_SIGNED_LOAD_REG_OFFSET: + insn = aarch64_insn_get_signed_ldr_reg_value(); + break; + case AARCH64_INSN_LDST_STORE_REG_OFFSET: + insn = aarch64_insn_get_str_reg_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + offset); +} + +u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + unsigned int imm, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + u32 shift; + + if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) { + pr_err("%s: unknown size encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + shift = aarch64_insn_ldst_size[size]; + if (imm & ~(BIT(12 + shift) - BIT(shift))) { + pr_err("%s: invalid imm: %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + imm >>= shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_IMM_OFFSET: + insn = aarch64_insn_get_ldr_imm_value(); + break; + case AARCH64_INSN_LDST_SIGNED_LOAD_IMM_OFFSET: + insn = aarch64_insn_get_signed_load_imm_value(); + break; + case AARCH64_INSN_LDST_STORE_IMM_OFFSET: + insn = aarch64_insn_get_str_imm_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); +} + +u32 aarch64_insn_gen_load_literal(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + bool is64bit) +{ + u32 insn; + long offset; + + offset = label_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_get_ldr_lit_value(); + + if (is64bit) + insn |= BIT(30); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); +} + +u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_register base, + int offset, + enum aarch64_insn_variant variant, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + int shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: + insn = aarch64_insn_get_ldp_pre_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: + insn = aarch64_insn_get_stp_pre_value(); + break; + case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: + insn = aarch64_insn_get_ldp_post_value(); + break; + case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: + insn = aarch64_insn_get_stp_post_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if ((offset & 0x3) || (offset < -256) || (offset > 252)) { + pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 2; + break; + case AARCH64_INSN_VARIANT_64BIT: + if ((offset & 0x7) || (offset < -512) || (offset > 504)) { + pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", + __func__, offset); + return AARCH64_BREAK_FAULT; + } + shift = 3; + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg1); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + reg2); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, + offset >> shift); +} + +u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_EX: + case AARCH64_INSN_LDST_LOAD_ACQ_EX: + insn = aarch64_insn_get_load_ex_value(); + if (type == AARCH64_INSN_LDST_LOAD_ACQ_EX) + insn |= BIT(15); + break; + case AARCH64_INSN_LDST_STORE_EX: + case AARCH64_INSN_LDST_STORE_REL_EX: + insn = aarch64_insn_get_store_ex_value(); + if (type == AARCH64_INSN_LDST_STORE_REL_EX) + insn |= BIT(15); + break; + default: + pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, + AARCH64_INSN_REG_ZR); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + state); +} + +#ifdef CONFIG_ARM64_LSE_ATOMICS +static u32 aarch64_insn_encode_ldst_order(enum aarch64_insn_mem_order_type type, + u32 insn) +{ + u32 order; + + switch (type) { + case AARCH64_INSN_MEM_ORDER_NONE: + order = 0; + break; + case AARCH64_INSN_MEM_ORDER_ACQ: + order = 2; + break; + case AARCH64_INSN_MEM_ORDER_REL: + order = 1; + break; + case AARCH64_INSN_MEM_ORDER_ACQREL: + order = 3; + break; + default: + pr_err("%s: unknown mem order %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~GENMASK(23, 22); + insn |= order << 22; + + return insn; +} + +u32 aarch64_insn_gen_atomic_ld_op(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size, + enum aarch64_insn_mem_atomic_op op, + enum aarch64_insn_mem_order_type order) +{ + u32 insn; + + switch (op) { + case AARCH64_INSN_MEM_ATOMIC_ADD: + insn = aarch64_insn_get_ldadd_value(); + break; + case AARCH64_INSN_MEM_ATOMIC_CLR: + insn = aarch64_insn_get_ldclr_value(); + break; + case AARCH64_INSN_MEM_ATOMIC_EOR: + insn = aarch64_insn_get_ldeor_value(); + break; + case AARCH64_INSN_MEM_ATOMIC_SET: + insn = aarch64_insn_get_ldset_value(); + break; + case AARCH64_INSN_MEM_ATOMIC_SWP: + insn = aarch64_insn_get_swp_value(); + break; + default: + pr_err("%s: unimplemented mem atomic op %d\n", __func__, op); + return AARCH64_BREAK_FAULT; + } + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_ldst_order(order, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} + +static u32 aarch64_insn_encode_cas_order(enum aarch64_insn_mem_order_type type, + u32 insn) +{ + u32 order; + + switch (type) { + case AARCH64_INSN_MEM_ORDER_NONE: + order = 0; + break; + case AARCH64_INSN_MEM_ORDER_ACQ: + order = BIT(22); + break; + case AARCH64_INSN_MEM_ORDER_REL: + order = BIT(15); + break; + case AARCH64_INSN_MEM_ORDER_ACQREL: + order = BIT(15) | BIT(22); + break; + default: + pr_err("%s: unknown mem order %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn &= ~(BIT(15) | BIT(22)); + insn |= order; + + return insn; +} + +u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size, + enum aarch64_insn_mem_order_type order) +{ + u32 insn; + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_get_cas_value(); + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_cas_order(order, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} +#endif + +u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_imm_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_imm_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_imm_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + /* We can't encode more than a 24bit value (12bit + 12bit shift) */ + if (imm & ~(BIT(24) - 1)) + goto out; + + /* If we have something in the top 12 bits... */ + if (imm & ~(SZ_4K - 1)) { + /* ... and in the low 12 bits -> error */ + if (imm & (SZ_4K - 1)) + goto out; + + imm >>= 12; + insn |= AARCH64_INSN_LSL_12; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); + +out: + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; +} + +u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int immr, int imms, + enum aarch64_insn_variant variant, + enum aarch64_insn_bitfield_type type) +{ + u32 insn; + u32 mask; + + switch (type) { + case AARCH64_INSN_BITFIELD_MOVE: + insn = aarch64_insn_get_bfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: + insn = aarch64_insn_get_ubfm_value(); + break; + case AARCH64_INSN_BITFIELD_MOVE_SIGNED: + insn = aarch64_insn_get_sbfm_value(); + break; + default: + pr_err("%s: unknown bitfield encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + mask = GENMASK(4, 0); + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; + mask = GENMASK(5, 0); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + if (immr & ~mask) { + pr_err("%s: invalid immr encoding %d\n", __func__, immr); + return AARCH64_BREAK_FAULT; + } + if (imms & ~mask) { + pr_err("%s: invalid imms encoding %d\n", __func__, imms); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, + int imm, int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_movewide_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_MOVEWIDE_ZERO: + insn = aarch64_insn_get_movz_value(); + break; + case AARCH64_INSN_MOVEWIDE_KEEP: + insn = aarch64_insn_get_movk_value(); + break; + case AARCH64_INSN_MOVEWIDE_INVERSE: + insn = aarch64_insn_get_movn_value(); + break; + default: + pr_err("%s: unknown movewide encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (imm & ~(SZ_64K - 1)) { + pr_err("%s: invalid immediate encoding %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift != 0 && shift != 16) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn |= (shift >> 4) << 21; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); +} + +u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_adsb_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_ADSB_ADD: + insn = aarch64_insn_get_add_value(); + break; + case AARCH64_INSN_ADSB_SUB: + insn = aarch64_insn_get_sub_value(); + break; + case AARCH64_INSN_ADSB_ADD_SETFLAGS: + insn = aarch64_insn_get_adds_value(); + break; + case AARCH64_INSN_ADSB_SUB_SETFLAGS: + insn = aarch64_insn_get_subs_value(); + break; + default: + pr_err("%s: unknown add/sub encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant, + enum aarch64_insn_data1_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA1_REVERSE_16: + insn = aarch64_insn_get_rev16_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_32: + insn = aarch64_insn_get_rev32_value(); + break; + case AARCH64_INSN_DATA1_REVERSE_64: + if (variant != AARCH64_INSN_VARIANT_64BIT) { + pr_err("%s: invalid variant for reverse64 %d\n", + __func__, variant); + return AARCH64_BREAK_FAULT; + } + insn = aarch64_insn_get_rev64_value(); + break; + default: + pr_err("%s: unknown data1 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); +} + +u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + enum aarch64_insn_variant variant, + enum aarch64_insn_data2_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA2_UDIV: + insn = aarch64_insn_get_udiv_value(); + break; + case AARCH64_INSN_DATA2_SDIV: + insn = aarch64_insn_get_sdiv_value(); + break; + case AARCH64_INSN_DATA2_LSLV: + insn = aarch64_insn_get_lslv_value(); + break; + case AARCH64_INSN_DATA2_LSRV: + insn = aarch64_insn_get_lsrv_value(); + break; + case AARCH64_INSN_DATA2_ASRV: + insn = aarch64_insn_get_asrv_value(); + break; + case AARCH64_INSN_DATA2_RORV: + insn = aarch64_insn_get_rorv_value(); + break; + default: + pr_err("%s: unknown data2 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); +} + +u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg1, + enum aarch64_insn_register reg2, + enum aarch64_insn_variant variant, + enum aarch64_insn_data3_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_DATA3_MADD: + insn = aarch64_insn_get_madd_value(); + break; + case AARCH64_INSN_DATA3_MSUB: + insn = aarch64_insn_get_msub_value(); + break; + default: + pr_err("%s: unknown data3 encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + reg1); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, + reg2); +} + +u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_register reg, + int shift, + enum aarch64_insn_variant variant, + enum aarch64_insn_logic_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_value(); + break; + case AARCH64_INSN_LOGIC_BIC: + insn = aarch64_insn_get_bic_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_value(); + break; + case AARCH64_INSN_LOGIC_ORN: + insn = aarch64_insn_get_orn_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_value(); + break; + case AARCH64_INSN_LOGIC_EON: + insn = aarch64_insn_get_eon_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_value(); + break; + case AARCH64_INSN_LOGIC_BIC_SETFLAGS: + insn = aarch64_insn_get_bics_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (shift & ~(SZ_32 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + if (shift & ~(SZ_64 - 1)) { + pr_err("%s: invalid shift encoding %d\n", __func__, + shift); + return AARCH64_BREAK_FAULT; + } + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); +} + +/* + * MOV (register) is architecturally an alias of ORR (shifted register) where + * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> + */ +u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + enum aarch64_insn_variant variant) +{ + return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR, + src, 0, variant, + AARCH64_INSN_LOGIC_ORR); +} + +u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, + enum aarch64_insn_register reg, + enum aarch64_insn_adr_type type) +{ + u32 insn; + s32 offset; + + switch (type) { + case AARCH64_INSN_ADR_TYPE_ADR: + insn = aarch64_insn_get_adr_value(); + offset = addr - pc; + break; + case AARCH64_INSN_ADR_TYPE_ADRP: + insn = aarch64_insn_get_adrp_value(); + offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; + break; + default: + pr_err("%s: unknown adr encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + if (offset < -SZ_1M || offset >= SZ_1M) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); +} + +/* + * Decode the imm field of a branch, and return the byte offset as a + * signed value (so it can be used when computing a new branch + * target). + */ +s32 aarch64_get_branch_offset(u32 insn) +{ + s32 imm; + + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); + return (imm << 6) >> 4; + } + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); + return (imm << 13) >> 11; + } + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { + imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); + return (imm << 18) >> 16; + } + + /* Unhandled instruction */ + BUG(); +} + +/* + * Encode the displacement of a branch in the imm field and return the + * updated instruction. + */ +u32 aarch64_set_branch_offset(u32 insn, s32 offset) +{ + if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, + offset >> 2); + + if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || + aarch64_insn_is_bcond(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, + offset >> 2); + + if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, + offset >> 2); + + /* Unhandled instruction */ + BUG(); +} + +s32 aarch64_insn_adrp_get_offset(u32 insn) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; +} + +u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) +{ + BUG_ON(!aarch64_insn_is_adrp(insn)); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, + offset >> 12); +} + +/* + * Extract the Op/CR data from a msr/mrs instruction. + */ +u32 aarch64_insn_extract_system_reg(u32 insn) +{ + return (insn & 0x1FFFE0) >> 5; +} + +bool aarch32_insn_is_wide(u32 insn) +{ + return insn >= 0xe800; +} + +/* + * Macros/defines for extracting register numbers from instruction. + */ +u32 aarch32_insn_extract_reg_num(u32 insn, int offset) +{ + return (insn & (0xf << offset)) >> offset; +} + +#define OPC2_MASK 0x7 +#define OPC2_OFFSET 5 +u32 aarch32_insn_mcr_extract_opc2(u32 insn) +{ + return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; +} + +#define CRM_MASK 0xf +u32 aarch32_insn_mcr_extract_crm(u32 insn) +{ + return insn & CRM_MASK; +} + +static bool range_of_ones(u64 val) +{ + /* Doesn't handle full ones or full zeroes */ + u64 sval = val >> __ffs64(val); + + /* One of Sean Eron Anderson's bithack tricks */ + return ((sval + 1) & (sval)) == 0; +} + +static u32 aarch64_encode_immediate(u64 imm, + enum aarch64_insn_variant variant, + u32 insn) +{ + unsigned int immr, imms, n, ones, ror, esz, tmp; + u64 mask; + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + esz = 32; + break; + case AARCH64_INSN_VARIANT_64BIT: + insn |= AARCH64_INSN_SF_BIT; + esz = 64; + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + mask = GENMASK(esz - 1, 0); + + /* Can't encode full zeroes, full ones, or value wider than the mask */ + if (!imm || imm == mask || imm & ~mask) + return AARCH64_BREAK_FAULT; + + /* + * Inverse of Replicate(). Try to spot a repeating pattern + * with a pow2 stride. + */ + for (tmp = esz / 2; tmp >= 2; tmp /= 2) { + u64 emask = BIT(tmp) - 1; + + if ((imm & emask) != ((imm >> tmp) & emask)) + break; + + esz = tmp; + mask = emask; + } + + /* N is only set if we're encoding a 64bit value */ + n = esz == 64; + + /* Trim imm to the element size */ + imm &= mask; + + /* That's how many ones we need to encode */ + ones = hweight64(imm); + + /* + * imms is set to (ones - 1), prefixed with a string of ones + * and a zero if they fit. Cap it to 6 bits. + */ + imms = ones - 1; + imms |= 0xf << ffs(esz); + imms &= BIT(6) - 1; + + /* Compute the rotation */ + if (range_of_ones(imm)) { + /* + * Pattern: 0..01..10..0 + * + * Compute how many rotate we need to align it right + */ + ror = __ffs64(imm); + } else { + /* + * Pattern: 0..01..10..01..1 + * + * Fill the unused top bits with ones, and check if + * the result is a valid immediate (all ones with a + * contiguous ranges of zeroes). + */ + imm |= ~mask; + if (!range_of_ones(~imm)) + return AARCH64_BREAK_FAULT; + + /* + * Compute the rotation to get a continuous set of + * ones, with the first bit set at position 0 + */ + ror = fls64(~imm); + } + + /* + * immr is the number of bits we need to rotate back to the + * original set of ones. Note that this is relative to the + * element size... + */ + immr = (esz - ror) % esz; + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); +} + +u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, + enum aarch64_insn_variant variant, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u64 imm) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LOGIC_AND: + insn = aarch64_insn_get_and_imm_value(); + break; + case AARCH64_INSN_LOGIC_ORR: + insn = aarch64_insn_get_orr_imm_value(); + break; + case AARCH64_INSN_LOGIC_EOR: + insn = aarch64_insn_get_eor_imm_value(); + break; + case AARCH64_INSN_LOGIC_AND_SETFLAGS: + insn = aarch64_insn_get_ands_imm_value(); + break; + default: + pr_err("%s: unknown logical encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_encode_immediate(imm, variant, insn); +} + +u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, + enum aarch64_insn_register Rm, + enum aarch64_insn_register Rn, + enum aarch64_insn_register Rd, + u8 lsb) +{ + u32 insn; + + insn = aarch64_insn_get_extr_value(); + + switch (variant) { + case AARCH64_INSN_VARIANT_32BIT: + if (lsb > 31) + return AARCH64_BREAK_FAULT; + break; + case AARCH64_INSN_VARIANT_64BIT: + if (lsb > 63) + return AARCH64_BREAK_FAULT; + insn |= AARCH64_INSN_SF_BIT; + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); + break; + default: + pr_err("%s: unknown variant encoding %d\n", __func__, variant); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); +} + +u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) +{ + u32 opt; + u32 insn; + + switch (type) { + case AARCH64_INSN_MB_SY: + opt = 0xf; + break; + case AARCH64_INSN_MB_ST: + opt = 0xe; + break; + case AARCH64_INSN_MB_LD: + opt = 0xd; + break; + case AARCH64_INSN_MB_ISH: + opt = 0xb; + break; + case AARCH64_INSN_MB_ISHST: + opt = 0xa; + break; + case AARCH64_INSN_MB_ISHLD: + opt = 0x9; + break; + case AARCH64_INSN_MB_NSH: + opt = 0x7; + break; + case AARCH64_INSN_MB_NSHST: + opt = 0x6; + break; + case AARCH64_INSN_MB_NSHLD: + opt = 0x5; + break; + default: + pr_err("%s: unknown dmb type %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_get_dmb_value(); + insn &= ~GENMASK(11, 8); + insn |= (opt << 8); + + return insn; +} diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S new file mode 100644 index 0000000000..20784ce75d --- /dev/null +++ b/arch/arm64/lib/kasan_sw_tags.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Google LLC + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Report a tag mismatch detected by tag-based KASAN. + * + * A compiler-generated thunk calls this with a non-AAPCS calling + * convention. Upon entry to this function, registers are as follows: + * + * x0: fault address (see below for restore) + * x1: fault description (see below for restore) + * x2 to x15: callee-saved + * x16 to x17: safe to clobber + * x18 to x30: callee-saved + * sp: pre-decremented by 256 bytes (see below for restore) + * + * The caller has decremented the SP by 256 bytes, and created a + * structure on the stack as follows: + * + * sp + 0..15: x0 and x1 to be restored + * sp + 16..231: free for use + * sp + 232..247: x29 and x30 (same as in GPRs) + * sp + 248..255: free for use + * + * Note that this is not a struct pt_regs. + * + * To call a regular AAPCS function we must save x2 to x15 (which we can + * store in the gaps), and create a frame record (for which we can use + * x29 and x30 spilled by the caller as those match the GPRs). + * + * The caller expects x0 and x1 to be restored from the structure, and + * for the structure to be removed from the stack (i.e. the SP must be + * incremented by 256 prior to return). + */ +SYM_CODE_START(__hwasan_tag_mismatch) + bti c + add x29, sp, #232 + stp x2, x3, [sp, #8 * 2] + stp x4, x5, [sp, #8 * 4] + stp x6, x7, [sp, #8 * 6] + stp x8, x9, [sp, #8 * 8] + stp x10, x11, [sp, #8 * 10] + stp x12, x13, [sp, #8 * 12] + stp x14, x15, [sp, #8 * 14] +#ifndef CONFIG_SHADOW_CALL_STACK + str x18, [sp, #8 * 18] +#endif + + mov x2, x30 + bl kasan_tag_mismatch + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #8 * 2] + ldp x4, x5, [sp, #8 * 4] + ldp x6, x7, [sp, #8 * 6] + ldp x8, x9, [sp, #8 * 8] + ldp x10, x11, [sp, #8 * 10] + ldp x12, x13, [sp, #8 * 12] + ldp x14, x15, [sp, #8 * 14] +#ifndef CONFIG_SHADOW_CALL_STACK + ldr x18, [sp, #8 * 18] +#endif + ldp x29, x30, [sp, #8 * 29] + + /* remove the structure from the stack */ + add sp, sp, #256 + ret +SYM_CODE_END(__hwasan_tag_mismatch) +EXPORT_SYMBOL(__hwasan_tag_mismatch) diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S new file mode 100644 index 0000000000..37a9f2a4f7 --- /dev/null +++ b/arch/arm64/lib/memchr.S @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Arm Ltd. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find a character in an area of memory. + * + * Parameters: + * x0 - buf + * x1 - c + * x2 - n + * Returns: + * x0 - address of first occurrence of 'c' or 0 + */ + +#define L(label) .L ## label + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define wordcnt x3 +#define rep01 x4 +#define repchr x5 +#define cur_word x6 +#define cur_byte w6 +#define tmp x7 +#define tmp2 x8 + + .p2align 4 + nop +SYM_FUNC_START(__pi_memchr) + and chrin, chrin, #0xff + lsr wordcnt, cntin, #3 + cbz wordcnt, L(byte_loop) + mov rep01, #REP8_01 + mul repchr, x1, rep01 + and cntin, cntin, #7 +L(word_loop): + ldr cur_word, [srcin], #8 + sub wordcnt, wordcnt, #1 + eor cur_word, cur_word, repchr + sub tmp, cur_word, rep01 + orr tmp2, cur_word, #REP8_7f + bics tmp, tmp, tmp2 + b.ne L(found_word) + cbnz wordcnt, L(word_loop) +L(byte_loop): + cbz cntin, L(not_found) + ldrb cur_byte, [srcin], #1 + sub cntin, cntin, #1 + cmp cur_byte, chrin + b.ne L(byte_loop) + sub srcin, srcin, #1 + ret +L(found_word): +CPU_LE( rev tmp, tmp) + clz tmp, tmp + sub tmp, tmp, #64 + add result, srcin, tmp, asr #3 + ret +L(not_found): + mov result, #0 + ret +SYM_FUNC_END(__pi_memchr) +SYM_FUNC_ALIAS_WEAK(memchr, __pi_memchr) +EXPORT_SYMBOL_NOKASAN(memchr) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 0000000000..a5ccf2c55f --- /dev/null +++ b/arch/arm64/lib/memcmp.S @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2013-2021, Arm Limited. + * + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#define L(label) .L ## label + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 + +SYM_FUNC_START(__pi_memcmp) + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w + ret +SYM_FUNC_END(__pi_memcmp) +SYM_FUNC_ALIAS_WEAK(memcmp, __pi_memcmp) +EXPORT_SYMBOL_NOKASAN(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S new file mode 100644 index 0000000000..4ab48d49c4 --- /dev/null +++ b/arch/arm64/lib/memcpy.S @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2012-2021, Arm Limited. + * + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#define L(label) .L ## label + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +SYM_FUNC_START(__pi_memcpy) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret +SYM_FUNC_END(__pi_memcpy) + +SYM_FUNC_ALIAS(__memcpy, __pi_memcpy) +EXPORT_SYMBOL(__memcpy) +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy) +EXPORT_SYMBOL(memcpy) + +SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy) + +SYM_FUNC_ALIAS(__memmove, __pi_memmove) +EXPORT_SYMBOL(__memmove) +SYM_FUNC_ALIAS_WEAK(memmove, __memmove) +EXPORT_SYMBOL(memmove) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S new file mode 100644 index 0000000000..a5aebe82ad --- /dev/null +++ b/arch/arm64/lib/memset.S @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +/* + * Fill in the buffer with character c (alignment handled by the hardware) + * + * Parameters: + * x0 - buf + * x1 - c + * x2 - n + * Returns: + * x0 - buf + */ + +dstin .req x0 +val .req w1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +zva_len_x .req x5 +zva_len .req w5 +zva_bits_x .req x6 + +A_l .req x7 +A_lw .req w7 +dst .req x8 +tmp3w .req w9 +tmp3 .req x9 + +SYM_FUNC_START(__pi_memset) + mov dst, dstin /* Preserve return value. */ + and A_lw, val, #255 + orr A_lw, A_lw, A_lw, lsl #8 + orr A_lw, A_lw, A_lw, lsl #16 + orr A_l, A_l, A_l, lsl #32 + + cmp count, #15 + b.hi .Lover16_proc + /*All store maybe are non-aligned..*/ + tbz count, #3, 1f + str A_l, [dst], #8 +1: + tbz count, #2, 2f + str A_lw, [dst], #4 +2: + tbz count, #1, 3f + strh A_lw, [dst], #2 +3: + tbz count, #0, 4f + strb A_lw, [dst] +4: + ret + +.Lover16_proc: + /*Whether the start address is aligned with 16.*/ + neg tmp2, dst + ands tmp2, tmp2, #15 + b.eq .Laligned +/* +* The count is not less than 16, we can use stp to store the start 16 bytes, +* then adjust the dst aligned with 16.This process will make the current +* memory address at alignment boundary. +*/ + stp A_l, A_l, [dst] /*non-aligned store..*/ + /*make the dst aligned..*/ + sub count, count, tmp2 + add dst, dst, tmp2 + +.Laligned: + cbz A_l, .Lzero_mem + +.Ltail_maybe_long: + cmp count, #64 + b.ge .Lnot_short +.Ltail63: + ands tmp1, count, #0x30 + b.eq 3f + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + stp A_l, A_l, [dst], #16 +1: + stp A_l, A_l, [dst], #16 +2: + stp A_l, A_l, [dst], #16 +/* +* The last store length is less than 16,use stp to write last 16 bytes. +* It will lead some bytes written twice and the access is non-aligned. +*/ +3: + ands count, count, #15 + cbz count, 4f + add dst, dst, count + stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ +4: + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line, this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lnot_short: + sub dst, dst, #16/* Pre-bias. */ + sub count, count, #64 +1: + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + stp A_l, A_l, [dst, #48] + stp A_l, A_l, [dst, #64]! + subs count, count, #64 + b.ge 1b + tst count, #0x3f + add dst, dst, #16 + b.ne .Ltail63 +.Lexitfunc: + ret + + /* + * For zeroing memory, check to see if we can use the ZVA feature to + * zero entire 'cache' lines. + */ +.Lzero_mem: + cmp count, #63 + b.le .Ltail63 + /* + * For zeroing small amounts of memory, it's not worth setting up + * the line-clear code. + */ + cmp count, #128 + b.lt .Lnot_short /*count is at least 128 bytes*/ + + mrs tmp1, dczid_el0 + tbnz tmp1, #4, .Lnot_short + mov tmp3w, #4 + and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ + lsl zva_len, tmp3w, zva_len + + ands tmp3w, zva_len, #63 + /* + * ensure the zva_len is not less than 64. + * It is not meaningful to use ZVA if the block size is less than 64. + */ + b.ne .Lnot_short +.Lzero_by_line: + /* + * Compute how far we need to go to become suitably aligned. We're + * already at quad-word alignment. + */ + cmp count, zva_len_x + b.lt .Lnot_short /* Not enough to reach alignment. */ + sub zva_bits_x, zva_len_x, #1 + neg tmp2, dst + ands tmp2, tmp2, zva_bits_x + b.eq 2f /* Already aligned. */ + /* Not aligned, check that there's enough to copy after alignment.*/ + sub tmp1, count, tmp2 + /* + * grantee the remain length to be ZVA is bigger than 64, + * avoid to make the 2f's process over mem range.*/ + cmp tmp1, #64 + ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ + b.lt .Lnot_short + /* + * We know that there's at least 64 bytes to zero and that it's safe + * to overrun by 64 bytes. + */ + mov count, tmp1 +1: + stp A_l, A_l, [dst] + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + subs tmp2, tmp2, #64 + stp A_l, A_l, [dst, #48] + add dst, dst, #64 + b.ge 1b + /* We've overrun a bit, so adjust dst downwards.*/ + add dst, dst, tmp2 +2: + sub count, count, zva_len_x +3: + dc zva, dst + add dst, dst, zva_len_x + subs count, count, zva_len_x + b.ge 3b + ands count, count, zva_bits_x + b.ne .Ltail_maybe_long + ret +SYM_FUNC_END(__pi_memset) + +SYM_FUNC_ALIAS(__memset, __pi_memset) +EXPORT_SYMBOL(__memset) + +SYM_FUNC_ALIAS_WEAK(memset, __pi_memset) +EXPORT_SYMBOL(memset) diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S new file mode 100644 index 0000000000..5018ac03b6 --- /dev/null +++ b/arch/arm64/lib/mte.S @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 ARM Ltd. + */ +#include <linux/linkage.h> + +#include <asm/asm-uaccess.h> +#include <asm/assembler.h> +#include <asm/mte.h> +#include <asm/page.h> +#include <asm/sysreg.h> + + .arch armv8.5-a+memtag + +/* + * multitag_transfer_size - set \reg to the block size that is accessed by the + * LDGM/STGM instructions. + */ + .macro multitag_transfer_size, reg, tmp + mrs_s \reg, SYS_GMID_EL1 + ubfx \reg, \reg, #GMID_EL1_BS_SHIFT, #GMID_EL1_BS_WIDTH + mov \tmp, #4 + lsl \reg, \tmp, \reg + .endm + +/* + * Clear the tags in a page + * x0 - address of the page to be cleared + */ +SYM_FUNC_START(mte_clear_page_tags) + multitag_transfer_size x1, x2 +1: stgm xzr, [x0] + add x0, x0, x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret +SYM_FUNC_END(mte_clear_page_tags) + +/* + * Zero the page and tags at the same time + * + * Parameters: + * x0 - address to the beginning of the page + */ +SYM_FUNC_START(mte_zero_clear_page_tags) + and x0, x0, #(1 << MTE_TAG_SHIFT) - 1 // clear the tag + mrs x1, dczid_el0 + tbnz x1, #4, 2f // Branch if DC GZVA is prohibited + and w1, w1, #0xf + mov x2, #4 + lsl x1, x2, x1 + +1: dc gzva, x0 + add x0, x0, x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret + +2: stz2g x0, [x0], #(MTE_GRANULE_SIZE * 2) + tst x0, #(PAGE_SIZE - 1) + b.ne 2b + ret +SYM_FUNC_END(mte_zero_clear_page_tags) + +/* + * Copy the tags from the source page to the destination one + * x0 - address of the destination page + * x1 - address of the source page + */ +SYM_FUNC_START(mte_copy_page_tags) + mov x2, x0 + mov x3, x1 + multitag_transfer_size x5, x6 +1: ldgm x4, [x3] + stgm x4, [x2] + add x2, x2, x5 + add x3, x3, x5 + tst x2, #(PAGE_SIZE - 1) + b.ne 1b + ret +SYM_FUNC_END(mte_copy_page_tags) + +/* + * Read tags from a user buffer (one tag per byte) and set the corresponding + * tags at the given kernel address. Used by PTRACE_POKEMTETAGS. + * x0 - kernel address (to) + * x1 - user buffer (from) + * x2 - number of tags/bytes (n) + * Returns: + * x0 - number of tags read/set + */ +SYM_FUNC_START(mte_copy_tags_from_user) + mov x3, x1 + cbz x2, 2f +1: +USER(2f, ldtrb w4, [x1]) + lsl x4, x4, #MTE_TAG_SHIFT + stg x4, [x0], #MTE_GRANULE_SIZE + add x1, x1, #1 + subs x2, x2, #1 + b.ne 1b + + // exception handling and function return +2: sub x0, x1, x3 // update the number of tags set + ret +SYM_FUNC_END(mte_copy_tags_from_user) + +/* + * Get the tags from a kernel address range and write the tag values to the + * given user buffer (one tag per byte). Used by PTRACE_PEEKMTETAGS. + * x0 - user buffer (to) + * x1 - kernel address (from) + * x2 - number of tags/bytes (n) + * Returns: + * x0 - number of tags read/set + */ +SYM_FUNC_START(mte_copy_tags_to_user) + mov x3, x0 + cbz x2, 2f +1: + ldg x4, [x1] + ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE +USER(2f, sttrb w4, [x0]) + add x0, x0, #1 + add x1, x1, #MTE_GRANULE_SIZE + subs x2, x2, #1 + b.ne 1b + + // exception handling and function return +2: sub x0, x0, x3 // update the number of tags copied + ret +SYM_FUNC_END(mte_copy_tags_to_user) + +/* + * Save the tags in a page + * x0 - page address + * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes + */ +SYM_FUNC_START(mte_save_page_tags) + multitag_transfer_size x7, x5 +1: + mov x2, #0 +2: + ldgm x5, [x0] + orr x2, x2, x5 + add x0, x0, x7 + tst x0, #0xFF // 16 tag values fit in a register, + b.ne 2b // which is 16*16=256 bytes + + str x2, [x1], #8 + + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + + ret +SYM_FUNC_END(mte_save_page_tags) + +/* + * Restore the tags in a page + * x0 - page address + * x1 - tag storage, MTE_PAGE_TAG_STORAGE bytes + */ +SYM_FUNC_START(mte_restore_page_tags) + multitag_transfer_size x7, x5 +1: + ldr x2, [x1], #8 +2: + stgm x2, [x0] + add x0, x0, x7 + tst x0, #0xFF + b.ne 2b + + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + + ret +SYM_FUNC_END(mte_restore_page_tags) diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S new file mode 100644 index 0000000000..94ee67a6b2 --- /dev/null +++ b/arch/arm64/lib/strchr.S @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Based on arch/arm/lib/strchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find the first occurrence of a character in a string. + * + * Parameters: + * x0 - str + * x1 - c + * Returns: + * x0 - address of first occurrence of 'c' or 0 + */ +SYM_FUNC_START(__pi_strchr) + and w1, w1, #0xff +1: ldrb w2, [x0], #1 + cmp w2, w1 + ccmp w2, wzr, #4, ne + b.ne 1b + sub x0, x0, #1 + cmp w2, w1 + csel x0, x0, xzr, eq + ret +SYM_FUNC_END(__pi_strchr) + +SYM_FUNC_ALIAS_WEAK(strchr, __pi_strchr) +EXPORT_SYMBOL_NOKASAN(strchr) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 0000000000..9b89b45336 --- /dev/null +++ b/arch/arm64/lib/strcmp.S @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2012-2022, Arm Limited. + * + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#define L(label) .L ## label + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +SYM_FUNC_START(__pi_strcmp) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret +SYM_FUNC_END(__pi_strcmp) +SYM_FUNC_ALIAS_WEAK(strcmp, __pi_strcmp) +EXPORT_SYMBOL_NOKASAN(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 0000000000..4919fe81ae --- /dev/null +++ b/arch/arm64/lib/strlen.S @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2013-2021, Arm Limited. + * + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/mte-def.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + */ + +#define L(label) .L ## label + +/* Arguments and results. */ +#define srcin x0 +#define len x0 + +/* Locals and temporaries. */ +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. A faster check + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives + false hits for characters 129..255. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* + * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE + * (16-byte) granularity, and we must ensure that no access straddles this + * alignment boundary. + */ +#ifdef CONFIG_KASAN_HW_TAGS +#define MIN_PAGE_SIZE MTE_GRANULE_SIZE +#else +#define MIN_PAGE_SIZE 4096 +#endif + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. In order to do an unaligned ldp + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 32 bytes per iteration + using the fast NUL check. If we encounter non-ASCII characters, + fallback to a second loop using the full NUL check. + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. */ + +SYM_FUNC_START(__pi_strlen) + and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_null1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) + + /* Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc + add len, len, tmp1, lsr 3 + ret + + /* The inner loop processes 32 bytes per iteration and uses the fast + NUL check. If we encounter non-ASCII characters, use a second + loop with the accurate NUL check. */ + .p2align 4 +L(main_loop_entry): + bic src, srcin, 15 + sub src, src, 16 +L(main_loop): + ldp data1, data2, [src, 32]! +L(page_cross_entry): + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + bne 1f + ldp data1, data2, [src, 16] + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + beq L(main_loop) + add src, src, 16 +1: + /* The fast check failed, so do the slower, accurate NUL check. */ + orr tmp2, data1, REP8_7f + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + + /* Enter with C = has_nul1 == 0. */ +L(tail): +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. */ + csel data1, data1, data2, cc + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, cc +#endif + sub len, src, srcin + rev has_nul1, has_nul1 + add tmp2, len, 8 + clz tmp1, has_nul1 + csel len, len, tmp2, cc + add len, len, tmp1, lsr 3 + ret + +L(nonascii_loop): + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + bne L(tail) + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + b L(tail) + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0x7f, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. */ +L(page_cross): + bic src, srcin, 15 + ldp data1, data2, [src] + lsl tmp1, srcin, 3 + mov tmp4, -1 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr tmp1, tmp1, REP8_80 + orn data1, data1, tmp1 + orn tmp2, data2, tmp1 + tst srcin, 8 + csel data1, data1, tmp4, eq + csel data2, data2, tmp2, eq + b L(page_cross_entry) +SYM_FUNC_END(__pi_strlen) +SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen) +EXPORT_SYMBOL_NOKASAN(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 0000000000..fe7bbc0b42 --- /dev/null +++ b/arch/arm64/lib/strncmp.S @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2013-2022, Arm Limited. + * + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#define L(label) .L ## label + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +SYM_FUNC_START(__pi_strncmp) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +SYM_FUNC_END(__pi_strncmp) +SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp) +EXPORT_SYMBOL_NOKASAN(strncmp) diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 0000000000..d5ac0e10a0 --- /dev/null +++ b/arch/arm64/lib/strnlen.S @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * determine the length of a fixed-size string + * + * Parameters: + * x0 - const string pointer + * x1 - maximal string length + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 +limit .req x1 + +/* Locals and temporaries. */ +src .req x2 +data1 .req x3 +data2 .req x4 +data2a .req x5 +has_nul1 .req x6 +has_nul2 .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +tmp4 .req x11 +zeroones .req x12 +pos .req x13 +limit_wd .req x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +SYM_FUNC_START(__pi_strnlen) + cbz limit, .Lhit_limit + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + subs limit_wd, limit_wd, #1 + orr tmp1, has_nul1, has_nul2 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ + b.eq .Lloop + + cbz tmp1, .Lhit_limit /* No null in final Qword. */ + + /* + * We know there's a null in the final Qword. The easiest thing + * to do now is work out the length of the string and return + * MIN (len, limit). + */ + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*perpare data to re-calculate the syndrome*/ + + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + ret + +.Lmisaligned: + /* + * Deal with a partial first word. + * We're doing two things in parallel here; + * 1) Calculate the number of words (but avoiding overflow if + * limit is near ULONG_MAX) - to do this we need to work out + * limit + tmp1 - 1 as a 65-bit value before shifting it; + * 2) Load and mask the initial data words - we force the bytes + * before the ones we are interested in to 0xff - this ensures + * early bytes will not hit any zero detection. + */ + ldp data1, data2, [src], #16 + + sub limit_wd, limit, #1 + and tmp3, limit_wd, #15 + lsr limit_wd, limit_wd, #4 + + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #4 + + neg tmp4, tmp1 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ + + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + + cmp tmp1, #8 + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned + +.Lhit_limit: + mov len, limit + ret +SYM_FUNC_END(__pi_strnlen) + +SYM_FUNC_ALIAS_WEAK(strnlen, __pi_strnlen) +EXPORT_SYMBOL_NOKASAN(strnlen) diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S new file mode 100644 index 0000000000..a5123cf0ce --- /dev/null +++ b/arch/arm64/lib/strrchr.S @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Based on arch/arm/lib/strrchr.S + * + * Copyright (C) 1995-2000 Russell King + * Copyright (C) 2013 ARM Ltd. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Find the last occurrence of a character in a string. + * + * Parameters: + * x0 - str + * x1 - c + * Returns: + * x0 - address of last occurrence of 'c' or 0 + */ +SYM_FUNC_START(__pi_strrchr) + mov x3, #0 + and w1, w1, #0xff +1: ldrb w2, [x0], #1 + cbz w2, 2f + cmp w2, w1 + b.ne 1b + sub x3, x0, #1 + b 1b +2: mov x0, x3 + ret +SYM_FUNC_END(__pi_strrchr) +SYM_FUNC_ALIAS_WEAK(strrchr, __pi_strrchr) +EXPORT_SYMBOL_NOKASAN(strrchr) diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S new file mode 100644 index 0000000000..a88613834f --- /dev/null +++ b/arch/arm64/lib/tishift.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) + * + * Copyright (C) 2017-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +#include <asm/assembler.h> + +SYM_FUNC_START(__ashlti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsl x1, x1, x2 + lsr x3, x0, x3 + lsl x2, x0, x2 + orr x1, x1, x3 + mov x0, x2 +1: + ret +2: + neg w1, w3 + mov x2, #0 + lsl x1, x0, x1 + mov x0, x2 + ret +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) + +SYM_FUNC_START(__ashrti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsr x0, x0, x2 + lsl x3, x1, x3 + asr x2, x1, x2 + orr x0, x0, x3 + mov x1, x2 +1: + ret +2: + neg w0, w3 + asr x2, x1, #63 + asr x0, x1, x0 + mov x1, x2 + ret +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__lshrti3) + cbz x2, 1f + mov x3, #64 + sub x3, x3, x2 + cmp x3, #0 + b.le 2f + lsr x0, x0, x2 + lsl x3, x1, x3 + lsr x2, x1, x2 + orr x0, x0, x3 + mov x1, x2 +1: + ret +2: + neg w0, w3 + mov x2, #0 + lsr x0, x1, x0 + mov x1, x2 + ret +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c new file mode 100644 index 0000000000..7510d1a231 --- /dev/null +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 ARM Ltd. + */ + +#include <linux/uaccess.h> +#include <asm/barrier.h> +#include <asm/cacheflush.h> + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + dcache_clean_pop((unsigned long)dst, (unsigned long)dst + cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc; + + rc = raw_copy_from_user(to, from, n); + + /* See above */ + dcache_clean_pop((unsigned long)to, (unsigned long)to + n - rc); + return rc; +} diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c new file mode 100644 index 0000000000..f9a53b7f98 --- /dev/null +++ b/arch/arm64/lib/xor-neon.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch/arm64/lib/xor-neon.c + * + * Authors: Jackie Liu <liuyun01@kylinos.cn> + * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. + */ + +#include <linux/raid/xor.h> +#include <linux/module.h> +#include <asm/neon-intrinsics.h> + +static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 */ + v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); + v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); + v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); + v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + } while (--lines > 0); +} + +static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 */ + v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); + v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); + v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); + v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); + + /* p1 ^= p3 */ + v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 */ + v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); + v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); + v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); + v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); + + /* p1 ^= p3 */ + v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 */ + v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); + v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); + v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); + v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); + + /* p1 ^= p3 */ + v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* p1 ^= p5 */ + v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + +struct xor_block_template xor_block_inner_neon __ro_after_init = { + .name = "__inner_neon__", + .do_2 = xor_arm64_neon_2, + .do_3 = xor_arm64_neon_3, + .do_4 = xor_arm64_neon_4, + .do_5 = xor_arm64_neon_5, +}; +EXPORT_SYMBOL(xor_block_inner_neon); + +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) +{ + uint64x2_t res; + + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" + "eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w"(res) : "w"(p), "w"(q), "w"(r)); + return res; +} + +static void xor_arm64_eor3_3(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_4(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 */ + v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); + v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); + v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); + v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + } while (--lines > 0); +} + +static void xor_arm64_eor3_5(unsigned long bytes, + unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + uint64_t *dp1 = (uint64_t *)p1; + uint64_t *dp2 = (uint64_t *)p2; + uint64_t *dp3 = (uint64_t *)p3; + uint64_t *dp4 = (uint64_t *)p4; + uint64_t *dp5 = (uint64_t *)p5; + + register uint64x2_t v0, v1, v2, v3; + long lines = bytes / (sizeof(uint64x2_t) * 4); + + do { + /* p1 ^= p2 ^ p3 */ + v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0), + vld1q_u64(dp3 + 0)); + v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2), + vld1q_u64(dp3 + 2)); + v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4), + vld1q_u64(dp3 + 4)); + v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6), + vld1q_u64(dp3 + 6)); + + /* p1 ^= p4 ^ p5 */ + v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0)); + v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2)); + v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4)); + v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6)); + + /* store */ + vst1q_u64(dp1 + 0, v0); + vst1q_u64(dp1 + 2, v1); + vst1q_u64(dp1 + 4, v2); + vst1q_u64(dp1 + 6, v3); + + dp1 += 8; + dp2 += 8; + dp3 += 8; + dp4 += 8; + dp5 += 8; + } while (--lines > 0); +} + +static int __init xor_neon_init(void) +{ + if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) { + xor_block_inner_neon.do_3 = xor_arm64_eor3_3; + xor_block_inner_neon.do_4 = xor_arm64_eor3_4; + xor_block_inner_neon.do_5 = xor_arm64_eor3_5; + } + return 0; +} +module_init(xor_neon_init); + +static void __exit xor_neon_exit(void) +{ +} +module_exit(xor_neon_exit); + +MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); +MODULE_DESCRIPTION("ARMv8 XOR Extensions"); +MODULE_LICENSE("GPL"); |