From 2c3c1048746a4622d8c89a29670120dc8fab93c4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:49:45 +0200 Subject: Adding upstream version 6.1.76. Signed-off-by: Daniel Baumann --- arch/xtensa/lib/Makefile | 12 + arch/xtensa/lib/ashldi3.S | 28 +++ arch/xtensa/lib/ashrdi3.S | 28 +++ arch/xtensa/lib/bswapdi2.S | 21 ++ arch/xtensa/lib/bswapsi2.S | 16 ++ arch/xtensa/lib/checksum.S | 357 ++++++++++++++++++++++++++ arch/xtensa/lib/divsi3.S | 74 ++++++ arch/xtensa/lib/kcsan-stubs.c | 54 ++++ arch/xtensa/lib/lshrdi3.S | 28 +++ arch/xtensa/lib/memcopy.S | 553 +++++++++++++++++++++++++++++++++++++++++ arch/xtensa/lib/memset.S | 153 ++++++++++++ arch/xtensa/lib/modsi3.S | 87 +++++++ arch/xtensa/lib/mulsi3.S | 133 ++++++++++ arch/xtensa/lib/pci-auto.c | 316 +++++++++++++++++++++++ arch/xtensa/lib/strncpy_user.S | 216 ++++++++++++++++ arch/xtensa/lib/strnlen_user.S | 141 +++++++++++ arch/xtensa/lib/udivsi3.S | 68 +++++ arch/xtensa/lib/umodsi3.S | 57 +++++ arch/xtensa/lib/umulsidi3.S | 232 +++++++++++++++++ arch/xtensa/lib/usercopy.S | 300 ++++++++++++++++++++++ 20 files changed, 2874 insertions(+) create mode 100644 arch/xtensa/lib/Makefile create mode 100644 arch/xtensa/lib/ashldi3.S create mode 100644 arch/xtensa/lib/ashrdi3.S create mode 100644 arch/xtensa/lib/bswapdi2.S create mode 100644 arch/xtensa/lib/bswapsi2.S create mode 100644 arch/xtensa/lib/checksum.S create mode 100644 arch/xtensa/lib/divsi3.S create mode 100644 arch/xtensa/lib/kcsan-stubs.c create mode 100644 arch/xtensa/lib/lshrdi3.S create mode 100644 arch/xtensa/lib/memcopy.S create mode 100644 arch/xtensa/lib/memset.S create mode 100644 arch/xtensa/lib/modsi3.S create mode 100644 arch/xtensa/lib/mulsi3.S create mode 100644 arch/xtensa/lib/pci-auto.c create mode 100644 arch/xtensa/lib/strncpy_user.S create mode 100644 arch/xtensa/lib/strnlen_user.S create mode 100644 arch/xtensa/lib/udivsi3.S create mode 100644 arch/xtensa/lib/umodsi3.S create mode 100644 arch/xtensa/lib/umulsidi3.S create mode 100644 arch/xtensa/lib/usercopy.S (limited to 'arch/xtensa/lib') diff --git a/arch/xtensa/lib/Makefile b/arch/xtensa/lib/Makefile new file mode 100644 index 000000000..c9c261418 --- /dev/null +++ b/arch/xtensa/lib/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for Xtensa-specific library files. 
+# + +lib-y += memcopy.o memset.o checksum.o \ + ashldi3.o ashrdi3.o bswapdi2.o bswapsi2.o lshrdi3.o \ + divsi3.o udivsi3.o modsi3.o umodsi3.o mulsi3.o umulsidi3.o \ + usercopy.o strncpy_user.o strnlen_user.o +lib-$(CONFIG_PCI) += pci-auto.o +lib-$(CONFIG_KCSAN) += kcsan-stubs.o +KCSAN_SANITIZE_kcsan-stubs.o := n diff --git a/arch/xtensa/lib/ashldi3.S b/arch/xtensa/lib/ashldi3.S new file mode 100644 index 000000000..67fb0da9e --- /dev/null +++ b/arch/xtensa/lib/ashldi3.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +#ifdef __XTENSA_EB__ +#define uh a2 +#define ul a3 +#else +#define uh a3 +#define ul a2 +#endif /* __XTENSA_EB__ */ + +ENTRY(__ashldi3) + + abi_entry_default + ssl a4 + bgei a4, 32, .Llow_only + src uh, uh, ul + sll ul, ul + abi_ret_default + +.Llow_only: + sll uh, ul + movi ul, 0 + abi_ret_default + +ENDPROC(__ashldi3) diff --git a/arch/xtensa/lib/ashrdi3.S b/arch/xtensa/lib/ashrdi3.S new file mode 100644 index 000000000..cbf052c51 --- /dev/null +++ b/arch/xtensa/lib/ashrdi3.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +#ifdef __XTENSA_EB__ +#define uh a2 +#define ul a3 +#else +#define uh a3 +#define ul a2 +#endif /* __XTENSA_EB__ */ + +ENTRY(__ashrdi3) + + abi_entry_default + ssr a4 + bgei a4, 32, .Lhigh_only + src ul, uh, ul + sra uh, uh + abi_ret_default + +.Lhigh_only: + sra ul, uh + srai uh, uh, 31 + abi_ret_default + +ENDPROC(__ashrdi3) diff --git a/arch/xtensa/lib/bswapdi2.S b/arch/xtensa/lib/bswapdi2.S new file mode 100644 index 000000000..d8e52e05e --- /dev/null +++ b/arch/xtensa/lib/bswapdi2.S @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__bswapdi2) + + abi_entry_default + ssai 8 + srli a4, a2, 16 + src a4, a4, a2 + src a4, a4, a4 + src a4, a2, a4 + srli a2, a3, 16 + src a2, a2, a3 + src a2, a2, a2 + src a2, a3, a2 + mov a3, a4 + abi_ret_default + +ENDPROC(__bswapdi2) diff --git a/arch/xtensa/lib/bswapsi2.S b/arch/xtensa/lib/bswapsi2.S new file mode 100644 index 000000000..9c1de1344 --- /dev/null +++ b/arch/xtensa/lib/bswapsi2.S @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__bswapsi2) + + abi_entry_default + ssai 8 + srli a3, a2, 16 + src a3, a3, a2 + src a3, a3, a3 + src a2, a2, a3 + abi_ret_default + +ENDPROC(__bswapsi2) diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S new file mode 100644 index 000000000..cf1bed1a5 --- /dev/null +++ b/arch/xtensa/lib/checksum.S @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea + * Optimized by Joe Taylor + */ + +#include +#include +#include +#include + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* + * unsigned int csum_partial(const unsigned char *buf, int len, + * unsigned int sum); + * a2 = buf + * a3 = len + * a4 = sum + * + * This function assumes 2- or 4-byte alignment. Other alignments will fail! + */ + +/* ONES_ADD converts twos-complement math to ones-complement. 
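For reference, here is a rough C model of the end-around-carry addition performed by the ONES_ADD macro defined just below, together with the word-at-a-time accumulation it is used for. This is an illustrative sketch only, not code from this patch; the real csum_partial also handles 2-byte alignment and odd trailing bytes.

#include <stdint.h>

/* Add 'val' into 'sum' with an end-around carry; this is what turns
 * ordinary two's-complement addition into the one's-complement
 * arithmetic the Internet checksum requires. */
static uint32_t ones_add(uint32_t sum, uint32_t val)
{
	sum += val;
	if (sum < val)		/* the 32-bit add wrapped around */
		sum += 1;	/* feed the carry back in */
	return sum;
}

/* Word-at-a-time accumulation over a 4-byte-aligned buffer. */
static uint32_t csum_words(const uint32_t *buf, int nwords, uint32_t sum)
{
	while (nwords--)
		sum = ones_add(sum, *buf++);
	return sum;
}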
*/ +#define ONES_ADD(sum, val) \ + add sum, sum, val ; \ + bgeu sum, val, 99f ; \ + addi sum, sum, 1 ; \ +99: ; + +.text +ENTRY(csum_partial) + + /* + * Experiments with Ethernet and SLIP connections show that buf + * is aligned on either a 2-byte or 4-byte boundary. + */ + abi_entry_default + extui a5, a2, 0, 2 + bnez a5, 8f /* branch if 2-byte aligned */ + /* Fall-through on common case, 4-byte alignment */ +1: + srli a5, a3, 5 /* 32-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 2f +#else + beqz a5, 2f + slli a5, a5, 5 + add a5, a5, a2 /* a5 = end of last 32-byte chunk */ +.Loop1: +#endif + l32i a6, a2, 0 + l32i a7, a2, 4 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 8 + l32i a7, a2, 12 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 16 + l32i a7, a2, 20 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 24 + l32i a7, a2, 28 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + addi a2, a2, 4*8 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop1 +#endif +2: + extui a5, a3, 2, 3 /* remaining 4-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 3f +#else + beqz a5, 3f + slli a5, a5, 2 + add a5, a5, a2 /* a5 = end of last 4-byte chunk */ +.Loop2: +#endif + l32i a6, a2, 0 + ONES_ADD(a4, a6) + addi a2, a2, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop2 +#endif +3: + _bbci.l a3, 1, 5f /* remaining 2-byte chunk */ + l16ui a6, a2, 0 + ONES_ADD(a4, a6) + addi a2, a2, 2 +5: + _bbci.l a3, 0, 7f /* remaining 1-byte chunk */ +6: l8ui a6, a2, 0 +#ifdef __XTENSA_EB__ + slli a6, a6, 8 /* load byte into bits 8..15 */ +#endif + ONES_ADD(a4, a6) +7: + mov a2, a4 + abi_ret_default + + /* uncommon case, buf is 2-byte aligned */ +8: + beqz a3, 7b /* branch if len == 0 */ + beqi a3, 1, 6b /* branch if len == 1 */ + + extui a5, a2, 0, 1 + bnez a5, 8f /* branch if 1-byte aligned */ + + l16ui a6, a2, 0 /* common case, len >= 2 */ + ONES_ADD(a4, a6) + addi a2, a2, 2 /* adjust buf */ + addi a3, a3, -2 /* adjust len */ + j 1b /* now buf is 4-byte aligned */ + + /* case: odd-byte aligned, len > 1 + * This case is dog slow, so don't give us an odd address. + * (I don't think this ever happens, but just in case.) + */ +8: + srli a5, a3, 2 /* 4-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 2f +#else + beqz a5, 2f + slli a5, a5, 2 + add a5, a5, a2 /* a5 = end of last 4-byte chunk */ +.Loop3: +#endif + l8ui a6, a2, 0 /* bits 24..31 */ + l16ui a7, a2, 1 /* bits 8..23 */ + l8ui a8, a2, 3 /* bits 0.. 8 */ +#ifdef __XTENSA_EB__ + slli a6, a6, 24 +#else + slli a8, a8, 24 +#endif + slli a7, a7, 8 + or a7, a7, a6 + or a7, a7, a8 + ONES_ADD(a4, a7) + addi a2, a2, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop3 +#endif +2: + _bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */ + l8ui a6, a2, 0 + l8ui a7, a2, 1 +#ifdef __XTENSA_EB__ + slli a6, a6, 8 +#else + slli a7, a7, 8 +#endif + or a7, a7, a6 + ONES_ADD(a4, a7) + addi a2, a2, 2 +3: + j 5b /* branch to handle the remaining byte */ + +ENDPROC(csum_partial) + +/* + * Copy from ds while checksumming, otherwise like csum_partial + */ + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, int len) + a2 = src + a3 = dst + a4 = len + a5 = sum + a8 = temp + a9 = temp + a10 = temp + + This function is optimized for 4-byte aligned addresses. Other + alignments work, but not nearly as efficiently. + */ + +ENTRY(csum_partial_copy_generic) + + abi_entry_default + movi a5, -1 + or a10, a2, a3 + + /* We optimize the following alignment tests for the 4-byte + aligned case. Two bbsi.l instructions might seem more optimal + (commented out below). 
However, both labels 5: and 3: are out + of the imm8 range, so the assembler relaxes them into + equivalent bbci.l, j combinations, which is actually + slower. */ + + extui a9, a10, 0, 2 + beqz a9, 1f /* branch if both are 4-byte aligned */ + bbsi.l a10, 0, 5f /* branch if one address is odd */ + j 3f /* one address is 2-byte aligned */ + +/* _bbsi.l a10, 0, 5f */ /* branch if odd address */ +/* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */ + +1: + /* src and dst are both 4-byte aligned */ + srli a10, a4, 5 /* 32-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 2f +#else + beqz a10, 2f + slli a10, a10, 5 + add a10, a10, a2 /* a10 = end of last 32-byte src chunk */ +.Loop5: +#endif +EX(10f) l32i a9, a2, 0 +EX(10f) l32i a8, a2, 4 +EX(10f) s32i a9, a3, 0 +EX(10f) s32i a8, a3, 4 + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +EX(10f) l32i a9, a2, 8 +EX(10f) l32i a8, a2, 12 +EX(10f) s32i a9, a3, 8 +EX(10f) s32i a8, a3, 12 + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +EX(10f) l32i a9, a2, 16 +EX(10f) l32i a8, a2, 20 +EX(10f) s32i a9, a3, 16 +EX(10f) s32i a8, a3, 20 + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +EX(10f) l32i a9, a2, 24 +EX(10f) l32i a8, a2, 28 +EX(10f) s32i a9, a3, 24 +EX(10f) s32i a8, a3, 28 + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) + addi a2, a2, 32 + addi a3, a3, 32 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop5 +#endif +2: + extui a10, a4, 2, 3 /* remaining 4-byte chunks */ + extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 3f +#else + beqz a10, 3f + slli a10, a10, 2 + add a10, a10, a2 /* a10 = end of last 4-byte src chunk */ +.Loop6: +#endif +EX(10f) l32i a9, a2, 0 +EX(10f) s32i a9, a3, 0 + ONES_ADD(a5, a9) + addi a2, a2, 4 + addi a3, a3, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop6 +#endif +3: + /* + Control comes to here in two cases: (1) It may fall through + to here from the 4-byte alignment case to process, at most, + one 2-byte chunk. (2) It branches to here from above if + either src or dst is 2-byte aligned, and we process all bytes + here, except for perhaps a trailing odd byte. It's + inefficient, so align your addresses to 4-byte boundaries. + + a2 = src + a3 = dst + a4 = len + a5 = sum + */ + srli a10, a4, 1 /* 2-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 4f +#else + beqz a10, 4f + slli a10, a10, 1 + add a10, a10, a2 /* a10 = end of last 2-byte src chunk */ +.Loop7: +#endif +EX(10f) l16ui a9, a2, 0 +EX(10f) s16i a9, a3, 0 + ONES_ADD(a5, a9) + addi a2, a2, 2 + addi a3, a3, 2 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop7 +#endif +4: + /* This section processes a possible trailing odd byte. */ + _bbci.l a4, 0, 8f /* 1-byte chunk */ +EX(10f) l8ui a9, a2, 0 +EX(10f) s8i a9, a3, 0 +#ifdef __XTENSA_EB__ + slli a9, a9, 8 /* shift byte to bits 8..15 */ +#endif + ONES_ADD(a5, a9) +8: + mov a2, a5 + abi_ret_default + +5: + /* Control branch to here when either src or dst is odd. We + process all bytes using 8-bit accesses. Grossly inefficient, + so don't feed us an odd address. 
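As an aside, the copy-while-checksumming idea used above can be modelled in a few lines of C: every word that gets copied is also folded into the running one's-complement sum, so the data is only read once. Illustrative sketch only; the real routine additionally handles misalignment and user-access faults through the EX()/.fixup machinery.

#include <stdint.h>

static uint32_t copy_and_csum_words(uint32_t *dst, const uint32_t *src,
				    int nwords, uint32_t sum)
{
	while (nwords--) {
		uint32_t w = *src++;

		*dst++ = w;		/* copy the word ... */
		sum += w;		/* ... and add it into the sum */
		if (sum < w)		/* end-around carry */
			sum += 1;
	}
	return sum;
}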
*/ + + srli a10, a4, 1 /* handle in pairs for 16-bit csum */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 6f +#else + beqz a10, 6f + slli a10, a10, 1 + add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */ +.Loop8: +#endif +EX(10f) l8ui a9, a2, 0 +EX(10f) l8ui a8, a2, 1 +EX(10f) s8i a9, a3, 0 +EX(10f) s8i a8, a3, 1 +#ifdef __XTENSA_EB__ + slli a9, a9, 8 /* combine into a single 16-bit value */ +#else /* for checksum computation */ + slli a8, a8, 8 +#endif + or a9, a9, a8 + ONES_ADD(a5, a9) + addi a2, a2, 2 + addi a3, a3, 2 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop8 +#endif +6: + j 4b /* process the possible trailing odd byte */ + +ENDPROC(csum_partial_copy_generic) + + +# Exception handler: +.section .fixup, "ax" +10: + movi a2, 0 + abi_ret_default + +.previous diff --git a/arch/xtensa/lib/divsi3.S b/arch/xtensa/lib/divsi3.S new file mode 100644 index 000000000..b044b4744 --- /dev/null +++ b/arch/xtensa/lib/divsi3.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__divsi3) + + abi_entry_default +#if XCHAL_HAVE_DIV32 + quos a2, a2, a3 +#else + xor a7, a2, a3 /* sign = dividend ^ divisor */ + do_abs a6, a2, a4 /* udividend = abs (dividend) */ + do_abs a3, a3, a4 /* udivisor = abs (divisor) */ + bltui a3, 2, .Lle_one /* check if udivisor <= 1 */ + do_nsau a5, a6, a2, a8 /* udividend_shift = nsau (udividend) */ + do_nsau a4, a3, a2, a8 /* udivisor_shift = nsau (udivisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */ + ssl a4 + sll a3, a3 /* udivisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment if udividend >= udivisor */ +.Lreturn: + neg a5, a2 + movltz a2, a5, a7 /* return (sign < 0) ? -quotient : quotient */ + abi_ret_default + +.Lle_one: + beqz a3, .Lerror + neg a2, a6 /* if udivisor == 1, then return... */ + movgez a2, a6, a7 /* (sign < 0) ? -udividend : udividend */ + abi_ret_default + +.Lspecial: + bltu a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */ + movi a2, 1 + movi a4, -1 + movltz a2, a4, a7 /* else return (sign < 0) ? -1 : 1 */ + abi_ret_default + +.Lerror: + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. 
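For reference, the test-subtract-and-shift loop that __divsi3 falls back to without a hardware divider looks roughly like this in C. This is a sketch only: __builtin_clz stands in for the NSAU/do_nsau shift-amount computation, and the divide-by-zero case simply returns 0 instead of raising the "DIV0" exception.

#include <stdint.h>

static int32_t soft_divsi3(int32_t n, int32_t d)
{
	uint32_t un = n < 0 ? -(uint32_t)n : (uint32_t)n;	/* |dividend| */
	uint32_t ud = d < 0 ? -(uint32_t)d : (uint32_t)d;	/* |divisor|  */
	uint32_t q = 0;
	int neg = (n ^ d) < 0;					/* result sign */
	int shift;

	if (ud == 0)
		return 0;	/* the real routine raises an exception ("DIV0") */

	/* Align the divisor's top bit with the dividend's, then produce one
	 * quotient bit per iteration, mirroring the assembly loop order. */
	shift = __builtin_clz(ud) - __builtin_clz(un | 1);
	if (shift > 0) {
		ud <<= shift;
		while (shift--) {
			if (un >= ud) {
				un -= ud;
				q += 1;
			}
			q <<= 1;
			ud >>= 1;
		}
	}
	if (un >= ud)		/* final conditional increment */
		q += 1;

	return neg ? -(int32_t)q : (int32_t)q;
}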
*/ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + abi_ret_default + +ENDPROC(__divsi3) diff --git a/arch/xtensa/lib/kcsan-stubs.c b/arch/xtensa/lib/kcsan-stubs.c new file mode 100644 index 000000000..2b08faa62 --- /dev/null +++ b/arch/xtensa/lib/kcsan-stubs.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void __atomic_store_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_load_8(const volatile void *p, int i) +{ + BUG(); +} + +u64 __atomic_exchange_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +bool __atomic_compare_exchange_8(volatile void *p1, void *p2, u64 v, bool b, int i1, int i2) +{ + BUG(); +} + +u64 __atomic_fetch_add_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_sub_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_and_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_or_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_xor_8(volatile void *p, u64 v, int i) +{ + BUG(); +} + +u64 __atomic_fetch_nand_8(volatile void *p, u64 v, int i) +{ + BUG(); +} diff --git a/arch/xtensa/lib/lshrdi3.S b/arch/xtensa/lib/lshrdi3.S new file mode 100644 index 000000000..129ef8d17 --- /dev/null +++ b/arch/xtensa/lib/lshrdi3.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +#ifdef __XTENSA_EB__ +#define uh a2 +#define ul a3 +#else +#define uh a3 +#define ul a2 +#endif /* __XTENSA_EB__ */ + +ENTRY(__lshrdi3) + + abi_entry_default + ssr a4 + bgei a4, 32, .Lhigh_only + src ul, uh, ul + srl uh, uh + abi_ret_default + +.Lhigh_only: + srl ul, uh + movi uh, 0 + abi_ret_default + +ENDPROC(__lshrdi3) diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S new file mode 100644 index 000000000..b20d206bc --- /dev/null +++ b/arch/xtensa/lib/memcopy.S @@ -0,0 +1,553 @@ +/* + * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions + * xthal_memcpy and xthal_bcopy + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2002 - 2012 Tensilica Inc. + */ + +#include +#include +#include + +/* + * void *memcpy(void *dst, const void *src, size_t len); + * + * This function is intended to do the same thing as the standard + * library function memcpy() for most cases. + * However, where the source and/or destination references + * an instruction RAM or ROM or a data RAM or ROM, that + * source and/or destination will always be accessed with + * 32-bit load and store instructions (as required for these + * types of devices). + * + * !!!!!!! XTFIXME: + * !!!!!!! Handling of IRAM/IROM has not yet + * !!!!!!! been implemented. + * + * The (general case) algorithm is as follows: + * If destination is unaligned, align it by conditionally + * copying 1 and 2 bytes. + * If source is aligned, + * do 16 bytes with a loop, and then finish up with + * 8, 4, 2, and 1 byte copies conditional on the length; + * else (if source is unaligned), + * do the same, but use SRC to align the source data. + * This code tries to use fall-through branches for the common + * case of aligned source and destination and multiple + * of 4 (or 8) length. 
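The copy strategy described above corresponds roughly to the following C sketch. It is illustrative only: the assembly copies 16 bytes per loop iteration and uses the SRC funnel-shift instruction for the unaligned-source case instead of falling back to byte copies.

#include <stddef.h>
#include <stdint.h>

static void *memcpy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Align the destination first (at most 3 single-byte copies). */
	while (len && ((uintptr_t)d & 3)) {
		*d++ = *s++;
		len--;
	}

	/* If the source is word-aligned too, copy a word at a time. */
	if (((uintptr_t)s & 3) == 0) {
		while (len >= 4) {
			*(uint32_t *)(void *)d = *(const uint32_t *)(const void *)s;
			d += 4;
			s += 4;
			len -= 4;
		}
	}

	/* Tail bytes, and the whole unaligned-source fallback. */
	while (len--)
		*d++ = *s++;

	return dst;
}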
+ * + * Register use: + * a0/ return address + * a1/ stack pointer + * a2/ return value + * a3/ src + * a4/ length + * a5/ dst + * a6/ tmp + * a7/ tmp + * a8/ tmp + * a9/ tmp + * a10/ tmp + * a11/ tmp + */ + + .text + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytecopydone + add a7, a3, a4 # a7 = end address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lnextbyte: + l8ui a6, a3, 0 + addi a3, a3, 1 + s8i a6, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytecopydone: + abi_ret_default + +/* + * Destination is unaligned + */ + + .align 4 +.Ldst1mod2: # dst is only byte aligned + _bltui a4, 7, .Lbytecopy # do short copies byte by byte + + # copy 1 byte + l8ui a6, a3, 0 + addi a3, a3, 1 + addi a4, a4, -1 + s8i a6, a5, 0 + addi a5, a5, 1 + _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then + # return to main algorithm +.Ldst2mod4: # dst 16-bit aligned + # copy 2 bytes + _bltui a4, 6, .Lbytecopy # do short copies byte by byte + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + addi a4, a4, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + j .Ldstaligned # dst is now aligned, return to main algorithm + +ENTRY(__memcpy) +WEAK(memcpy) + + abi_entry_default + # a2/ dst, a3/ src, a4/ len + mov a5, a2 # copy dst so that a2 is return value +.Lcommon: + _bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2 + _bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4 +.Ldstaligned: # return here from .Ldst?mod? once dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is not aligned, + _bany a3, a8, .Lsrcunaligned # then use shifting copy + /* + * Destination and source are word-aligned, use word copy. 
+ */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a8, a7, 4 + add a8, a8, a3 # a8 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: + l32i a6, a3, 0 + l32i a7, a3, 4 + s32i a6, a5, 0 + l32i a6, a3, 8 + s32i a7, a5, 4 + l32i a7, a3, 12 + s32i a6, a5, 8 + addi a3, a3, 16 + s32i a7, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # copy 8 bytes + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a3, a3, 8 + s32i a6, a5, 0 + s32i a7, a5, 4 + addi a5, a5, 8 +.L2: + bbsi.l a4, 2, .L3 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + abi_ret_default +.L3: + # copy 4 bytes + l32i a6, a3, 0 + addi a3, a3, 4 + s32i a6, a5, 0 + addi a5, a5, 4 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + abi_ret_default +.L4: + # copy 2 bytes + l16ui a6, a3, 0 + addi a3, a3, 2 + s16i a6, a5, 0 + addi a5, a5, 2 + bbsi.l a4, 0, .L5 + abi_ret_default +.L5: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + abi_ret_default + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 +.Lsrcunaligned: + _beqz a4, .Ldone # avoid loading anything for zero-length copies + # copy 16 bytes per iteration for word-aligned dst and unaligned src + __ssa8 a3 # set shift amount from byte offset + +/* set to 1 when running on ISS (simulator) with the + lint or ferret client, or 0 to save a few cycles */ +#define SIM_CHECKS_ALIGNMENT 1 +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + and a11, a3, a8 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop2done + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2: + l32i a7, a3, 4 + l32i a8, a3, 8 + __src_b a6, a6, a7 + s32i a6, a5, 0 + l32i a9, a3, 12 + __src_b a7, a7, a8 + s32i a7, a5, 4 + l32i a6, a3, 16 + __src_b a8, a8, a9 + s32i a8, a5, 8 + addi a3, a3, 16 + __src_b a9, a9, a6 + s32i a9, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2done: + bbci.l a4, 3, .L12 + # copy 8 bytes + l32i a7, a3, 4 + l32i a8, a3, 8 + __src_b a6, a6, a7 + s32i a6, a5, 0 + addi a3, a3, 8 + __src_b a7, a7, a8 + s32i a7, a5, 4 + addi a5, a5, 8 + mov a6, a8 +.L12: + bbci.l a4, 2, .L13 + # copy 4 bytes + l32i a7, a3, 4 + addi a3, a3, 4 + __src_b a6, a6, a7 + s32i a6, a5, 0 + addi a5, a5, 4 + mov a6, a7 +.L13: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, .L14 + bbsi.l a4, 0, .L15 +.Ldone: abi_ret_default +.L14: + # copy 2 bytes + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + bbsi.l a4, 0, .L15 + abi_ret_default +.L15: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + abi_ret_default + +ENDPROC(__memcpy) + +/* + * void bcopy(const void *src, void *dest, size_t n); + */ + +ENTRY(bcopy) + + abi_entry_default + # a2=src, a3=dst, a4=len + mov a5, a3 + mov a3, a2 + mov a2, a5 + j .Lmovecommon # go to common code for memmove+bcopy + +ENDPROC(bcopy) + +/* + * void *memmove(void *dst, const void *src, size_t len); + * + * This function is intended to do the same thing as the standard 
+ * library function memmove() for most cases. + * However, where the source and/or destination references + * an instruction RAM or ROM or a data RAM or ROM, that + * source and/or destination will always be accessed with + * 32-bit load and store instructions (as required for these + * types of devices). + * + * !!!!!!! XTFIXME: + * !!!!!!! Handling of IRAM/IROM has not yet + * !!!!!!! been implemented. + * + * The (general case) algorithm is as follows: + * If end of source doesn't overlap destination then use memcpy. + * Otherwise do memcpy backwards. + * + * Register use: + * a0/ return address + * a1/ stack pointer + * a2/ return value + * a3/ src + * a4/ length + * a5/ dst + * a6/ tmp + * a7/ tmp + * a8/ tmp + * a9/ tmp + * a10/ tmp + * a11/ tmp + */ + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbackbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbackbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbackbytecopydone + sub a7, a3, a4 # a7 = start address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbacknextbyte: + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a7, .Lbacknextbyte # continue loop if + # $a3:src != $a7:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbackbytecopydone: + abi_ret_default + +/* + * Destination is unaligned + */ + + .align 4 +.Lbackdst1mod2: # dst is only byte aligned + _bltui a4, 7, .Lbackbytecopy # do short copies byte by byte + + # copy 1 byte + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 + addi a4, a4, -1 + _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then + # return to main algorithm +.Lbackdst2mod4: # dst 16-bit aligned + # copy 2 bytes + _bltui a4, 6, .Lbackbytecopy # do short copies byte by byte + addi a3, a3, -2 + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a5, a5, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a4, a4, -2 + j .Lbackdstaligned # dst is now aligned, + # return to main algorithm + +ENTRY(__memmove) +WEAK(memmove) + + abi_entry_default + # a2/ dst, a3/ src, a4/ len + mov a5, a2 # copy dst so that a2 is return value +.Lmovecommon: + sub a6, a5, a3 + bgeu a6, a4, .Lcommon + + add a5, a5, a4 + add a3, a3, a4 + + _bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2 + _bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4 +.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is not aligned, + _bany a3, a8, .Lbacksrcunaligned # then use shifting copy + /* + * Destination and source are word-aligned, use word copy. 
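The overlap test performed at .Lmovecommon above is a single unsigned comparison of (dst - src) against len: the subtraction wraps around when dst precedes src, so the compare succeeds for every layout where a forward copy is safe. A byte-level C sketch of the whole decision (illustrative only):

#include <stddef.h>
#include <stdint.h>

static int forward_copy_is_safe(const void *dst, const void *src, size_t len)
{
	/* Mirrors "sub a6, a5, a3; bgeu a6, a4, .Lcommon" above. */
	return (uintptr_t)dst - (uintptr_t)src >= (uintptr_t)len;
}

static void *memmove_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (forward_copy_is_safe(dst, src, len)) {
		while (len--)			/* forward, same as memcpy */
			*d++ = *s++;
	} else {
		d += len;
		s += len;
		while (len--)			/* backward, regions overlap */
			*--d = *--s;
	}
	return dst;
}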
+ */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src +#if XCHAL_HAVE_LOOPS + loopnez a7, .LbackLoop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .LbackLoop1done + slli a8, a7, 4 + sub a8, a3, a8 # a8 = start of first 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.LbackLoop1: + addi a3, a3, -16 + l32i a7, a3, 12 + l32i a6, a3, 8 + addi a5, a5, -16 + s32i a7, a5, 12 + l32i a7, a3, 4 + s32i a6, a5, 8 + l32i a6, a3, 0 + s32i a7, a5, 4 + s32i a6, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a8, .LbackLoop1 # continue loop if a3:src != a8:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.LbackLoop1done: + bbci.l a4, 3, .Lback2 + # copy 8 bytes + addi a3, a3, -8 + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a5, a5, -8 + s32i a6, a5, 0 + s32i a7, a5, 4 +.Lback2: + bbsi.l a4, 2, .Lback3 + bbsi.l a4, 1, .Lback4 + bbsi.l a4, 0, .Lback5 + abi_ret_default +.Lback3: + # copy 4 bytes + addi a3, a3, -4 + l32i a6, a3, 0 + addi a5, a5, -4 + s32i a6, a5, 0 + bbsi.l a4, 1, .Lback4 + bbsi.l a4, 0, .Lback5 + abi_ret_default +.Lback4: + # copy 2 bytes + addi a3, a3, -2 + l16ui a6, a3, 0 + addi a5, a5, -2 + s16i a6, a5, 0 + bbsi.l a4, 0, .Lback5 + abi_ret_default +.Lback5: + # copy 1 byte + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 + abi_ret_default + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 +.Lbacksrcunaligned: + _beqz a4, .Lbackdone # avoid loading anything for zero-length copies + # copy 16 bytes per iteration for word-aligned dst and unaligned src + __ssa8 a3 # set shift amount from byte offset +#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with + * the lint or ferret client, or 0 + * to save a few cycles */ +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + and a11, a3, a8 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .LbackLoop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .LbackLoop2done + slli a10, a7, 4 + sub a10, a3, a10 # a10 = start of first 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.LbackLoop2: + addi a3, a3, -16 + l32i a7, a3, 12 + l32i a8, a3, 8 + addi a5, a5, -16 + __src_b a6, a7, a6 + s32i a6, a5, 12 + l32i a9, a3, 4 + __src_b a7, a8, a7 + s32i a7, a5, 8 + l32i a6, a3, 0 + __src_b a8, a9, a8 + s32i a8, a5, 4 + __src_b a9, a6, a9 + s32i a9, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.LbackLoop2done: + bbci.l a4, 3, .Lback12 + # copy 8 bytes + addi a3, a3, -8 + l32i a7, a3, 4 + l32i a8, a3, 0 + addi a5, a5, -8 + __src_b a6, a7, a6 + s32i a6, a5, 4 + __src_b a7, a8, a7 + s32i a7, a5, 0 + mov a6, a8 +.Lback12: + bbci.l a4, 2, .Lback13 + # copy 4 bytes + addi a3, a3, -4 + l32i a7, a3, 0 + addi a5, a5, -4 + __src_b a6, a7, a6 + s32i a6, a5, 0 + mov a6, a7 +.Lback13: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, .Lback14 + bbsi.l a4, 0, .Lback15 +.Lbackdone: + abi_ret_default +.Lback14: + # copy 2 bytes + addi a3, a3, -2 + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a5, a5, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + bbsi.l a4, 0, .Lback15 + abi_ret_default +.Lback15: + # copy 1 byte + addi a3, a3, -1 + addi a5, a5, -1 + l8ui a6, a3, 0 + s8i a6, a5, 0 + abi_ret_default + +ENDPROC(__memmove) diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S new file mode 100644 index 000000000..59b1524fd --- /dev/null +++ 
b/arch/xtensa/lib/memset.S @@ -0,0 +1,153 @@ +/* + * arch/xtensa/lib/memset.S + * + * ANSI C standard library function memset + * (Well, almost. .fixup code might return zero.) + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * Copyright (C) 2002 Tensilica Inc. + */ + +#include +#include +#include + +/* + * void *memset(void *dst, int c, size_t length) + * + * The algorithm is as follows: + * Create a word with c in all byte positions + * If the destination is aligned, + * do 16B chucks with a loop, and then finish up with + * 8B, 4B, 2B, and 1B stores conditional on the length. + * If destination is unaligned, align it by conditionally + * setting 1B and 2B and then go to aligned case. + * This code tries to use fall-through branches for the common + * case of an aligned destination (except for the branches to + * the alignment labels). + */ + +.text +ENTRY(__memset) +WEAK(memset) + + abi_entry_default + # a2/ dst, a3/ c, a4/ length + extui a3, a3, 0, 8 # mask to just 8 bits + slli a7, a3, 8 # duplicate character in all bytes of word + or a3, a3, a7 # ... + slli a7, a3, 16 # ... + or a3, a3, a7 # ... + mov a5, a2 # copy dst so that a2 is return value + movi a6, 3 # for alignment tests + bany a2, a6, .Ldstunaligned # if dst is unaligned +.L0: # return here from .Ldstunaligned when dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + bnez a4, .Laligned + abi_ret_default + +/* + * Destination is word-aligned. + */ + # set 16 bytes per iteration for word-aligned dst + .align 4 # 1 mod 4 alignment for LOOPNEZ + .byte 0 # (0 mod 4 alignment for LBEG) +.Laligned: +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a6, a7, 4 + add a6, a6, a5 # a6 = end of last 16B chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: +EX(10f) s32i a3, a5, 0 +EX(10f) s32i a3, a5, 4 +EX(10f) s32i a3, a5, 8 +EX(10f) s32i a3, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + blt a5, a6, .Loop1 +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # set 8 bytes +EX(10f) s32i a3, a5, 0 +EX(10f) s32i a3, a5, 4 + addi a5, a5, 8 +.L2: + bbci.l a4, 2, .L3 + # set 4 bytes +EX(10f) s32i a3, a5, 0 + addi a5, a5, 4 +.L3: + bbci.l a4, 1, .L4 + # set 2 bytes +EX(10f) s16i a3, a5, 0 + addi a5, a5, 2 +.L4: + bbci.l a4, 0, .L5 + # set 1 byte +EX(10f) s8i a3, a5, 0 +.L5: +.Lret1: + abi_ret_default + +/* + * Destination is unaligned + */ + +.Ldstunaligned: + bltui a4, 8, .Lbyteset # do short copies byte by byte + bbci.l a5, 0, .L20 # branch if dst alignment half-aligned + # dst is only byte aligned + # set 1 byte +EX(10f) s8i a3, a5, 0 + addi a5, a5, 1 + addi a4, a4, -1 + # now retest if dst aligned + bbci.l a5, 1, .L0 # if now aligned, return to main algorithm +.L20: + # dst half-aligned + # set 2 bytes +EX(10f) s16i a3, a5, 0 + addi a5, a5, 2 + addi a4, a4, -2 + j .L0 # dst is now aligned, return to main algorithm + +/* + * Byte by byte set + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbyteset: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytesetdone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytesetdone + add a6, a5, a4 # a6 = ending address +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbyteloop: +EX(10f) s8i a3, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + blt a5, a6, .Lbyteloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytesetdone: + abi_ret_default + +ENDPROC(__memset) + + 
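For reference, the fill-word construction and the aligned store loop described above map onto C roughly as follows. This is a sketch: the assembly stores 16 bytes per iteration and, if a user-space store faults, returns zero through the .fixup handler that follows.

#include <stddef.h>
#include <stdint.h>

static void *memset_sketch(void *dst, int c, size_t len)
{
	unsigned char *d = dst;
	uint32_t word = (unsigned char)c;

	word |= word << 8;			/* duplicate the byte ...   */
	word |= word << 16;			/* ... into all four lanes  */

	while (len && ((uintptr_t)d & 3)) {	/* align the destination    */
		*d++ = (unsigned char)c;
		len--;
	}
	while (len >= 4) {			/* word stores              */
		*(uint32_t *)(void *)d = word;
		d += 4;
		len -= 4;
	}
	while (len--)				/* trailing bytes           */
		*d++ = (unsigned char)c;

	return dst;
}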
.section .fixup, "ax" + .align 4 + +/* We return zero if a failure occurred. */ + +10: + movi a2, 0 + abi_ret_default diff --git a/arch/xtensa/lib/modsi3.S b/arch/xtensa/lib/modsi3.S new file mode 100644 index 000000000..d00e77181 --- /dev/null +++ b/arch/xtensa/lib/modsi3.S @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__modsi3) + + abi_entry_default +#if XCHAL_HAVE_DIV32 + rems a2, a2, a3 +#else + mov a7, a2 /* save original (signed) dividend */ + do_abs a2, a2, a4 /* udividend = abs (dividend) */ + do_abs a3, a3, a4 /* udivisor = abs (divisor) */ + bltui a3, 2, .Lle_one /* check if udivisor <= 1 */ + do_nsau a5, a2, a6, a8 /* udividend_shift = nsau (udividend) */ + do_nsau a4, a3, a6, a8 /* udivisor_shift = nsau (udivisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */ + ssl a4 + sll a3, a3 /* udivisor <<= count */ + + /* test-subtract-and-shift loop */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a2, a3, .Lzerobit + sub a2, a2, a3 +.Lzerobit: + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + +.Lspecial: + bltu a2, a3, .Lreturn + sub a2, a2, a3 /* subtract again if udividend >= udivisor */ +.Lreturn: + bgez a7, .Lpositive + neg a2, a2 /* if (dividend < 0), return -udividend */ +.Lpositive: + abi_ret_default + +.Lle_one: + bnez a3, .Lreturn0 + + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + abi_ret_default + +ENDPROC(__modsi3) + +#if !XCHAL_HAVE_NSA + .section .rodata + .align 4 + .global __nsau_data + .type __nsau_data, @object +__nsau_data: + .byte 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .size __nsau_data, . 
- __nsau_data +#endif /* !XCHAL_HAVE_NSA */ diff --git a/arch/xtensa/lib/mulsi3.S b/arch/xtensa/lib/mulsi3.S new file mode 100644 index 000000000..91a9d7c62 --- /dev/null +++ b/arch/xtensa/lib/mulsi3.S @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + + .macro do_addx2 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx2 \dst, \as, \at +#else + slli \tmp, \as, 1 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx4 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx4 \dst, \as, \at +#else + slli \tmp, \as, 2 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx8 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx8 \dst, \as, \at +#else + slli \tmp, \as, 3 + add \dst, \tmp, \at +#endif + .endm + +ENTRY(__mulsi3) + + abi_entry_default + +#if XCHAL_HAVE_MUL32 + mull a2, a2, a3 + +#elif XCHAL_HAVE_MUL16 + or a4, a2, a3 + srai a4, a4, 16 + bnez a4, .LMUL16 + mul16u a2, a2, a3 + abi_ret_default +.LMUL16: + srai a4, a2, 16 + srai a5, a3, 16 + mul16u a7, a4, a3 + mul16u a6, a5, a2 + mul16u a4, a2, a3 + add a7, a7, a6 + slli a7, a7, 16 + add a2, a7, a4 + +#elif XCHAL_HAVE_MAC16 + mul.aa.hl a2, a3 + mula.aa.lh a2, a3 + rsr a5, ACCLO + umul.aa.ll a2, a3 + rsr a4, ACCLO + slli a5, a5, 16 + add a2, a4, a5 + +#else /* !MUL32 && !MUL16 && !MAC16 */ + + /* Multiply one bit at a time, but unroll the loop 4x to better + exploit the addx instructions and avoid overhead. + Peel the first iteration to save a cycle on init. */ + + /* Avoid negative numbers. */ + xor a5, a2, a3 /* Top bit is 1 if one input is negative. */ + do_abs a3, a3, a6 + do_abs a2, a2, a6 + + /* Swap so the second argument is smaller. */ + sub a7, a2, a3 + mov a4, a3 + movgez a4, a2, a7 /* a4 = max (a2, a3) */ + movltz a3, a2, a7 /* a3 = min (a2, a3) */ + + movi a2, 0 + extui a6, a3, 0, 1 + movnez a2, a4, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + neg a3, a2 + movltz a2, a3, a5 + abi_ret_default + + .align 4 +.Lmult_main_loop: + srli a3, a3, 4 + slli a4, a4, 4 + + add a7, a4, a2 + extui a6, a3, 0, 1 + movnez a2, a7, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + + neg a3, a2 + movltz a2, a3, a5 + +#endif /* !MUL32 && !MUL16 && !MAC16 */ + + abi_ret_default + +ENDPROC(__mulsi3) diff --git a/arch/xtensa/lib/pci-auto.c b/arch/xtensa/lib/pci-auto.c new file mode 100644 index 000000000..aa6752237 --- /dev/null +++ b/arch/xtensa/lib/pci-auto.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * arch/xtensa/lib/pci-auto.c + * + * PCI autoconfiguration library + * + * Copyright (C) 2001 - 2005 Tensilica Inc. 
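For reference, the software multiply that __mulsi3 above falls back to when no multiply hardware is configured is the classic shift-and-add scheme. The C sketch below is illustrative only; the assembly unrolls the loop four bits per iteration using ADDX2/ADDX4/ADDX8 and conditional moves, and peels the first iteration.

#include <stdint.h>

static int32_t soft_mulsi3(int32_t a, int32_t b)
{
	uint32_t ua = a < 0 ? -(uint32_t)a : (uint32_t)a;
	uint32_t ub = b < 0 ? -(uint32_t)b : (uint32_t)b;
	uint32_t big = ua > ub ? ua : ub;	/* iterate over the smaller operand */
	uint32_t small = ua > ub ? ub : ua;
	uint32_t acc = 0;
	int neg = (a ^ b) < 0;

	while (small) {
		if (small & 1)			/* add a shifted copy for each set bit */
			acc += big;
		big <<= 1;
		small >>= 1;
	}
	return neg ? -(int32_t)acc : (int32_t)acc;
}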
+ * + * Chris Zankel + * + * Based on work from Matt Porter + */ + +#include +#include +#include + +#include + + +/* + * + * Setting up a PCI + * + * pci_ctrl->first_busno = + * pci_ctrl->last_busno = + * pci_ctrl->ops = + * pci_ctrl->map_irq = + * + * pci_ctrl->io_space.start = + * pci_ctrl->io_space.end = + * pci_ctrl->io_space.base = + * pci_ctrl->mem_space.start = + * pci_ctrl->mem_space.end = + * pci_ctrl->mem_space.base = + * + * pcibios_init_resource(&pci_ctrl->io_resource, , + * , IORESOURCE_IO, "PCI host bridge"); + * pcibios_init_resource(&pci_ctrl->mem_resources[0], , + * , IORESOURCE_MEM, "PCI host bridge"); + * + * pci_ctrl->last_busno = pciauto_bus_scan(pci_ctrl,pci_ctrl->first_busno); + * + * int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) + * + */ + +static int pciauto_upper_iospc; +static int pciauto_upper_memspc; + +static struct pci_dev pciauto_dev; +static struct pci_bus pciauto_bus; + +/* + * Helper functions + */ + +/* Initialize the bars of a PCI device. */ + +static void __init +pciauto_setup_bars(struct pci_dev *dev, int bar_limit) +{ + int bar_size; + int bar, bar_nr; + int *upper_limit; + int found_mem64 = 0; + + for (bar = PCI_BASE_ADDRESS_0, bar_nr = 0; + bar <= bar_limit; + bar+=4, bar_nr++) + { + /* Tickle the BAR and get the size */ + pci_write_config_dword(dev, bar, 0xffffffff); + pci_read_config_dword(dev, bar, &bar_size); + + /* If BAR is not implemented go to the next BAR */ + if (!bar_size) + continue; + + /* Check the BAR type and set our address mask */ + if (bar_size & PCI_BASE_ADDRESS_SPACE_IO) + { + bar_size &= PCI_BASE_ADDRESS_IO_MASK; + upper_limit = &pciauto_upper_iospc; + pr_debug("PCI Autoconfig: BAR %d, I/O, ", bar_nr); + } + else + { + if ((bar_size & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == + PCI_BASE_ADDRESS_MEM_TYPE_64) + found_mem64 = 1; + + bar_size &= PCI_BASE_ADDRESS_MEM_MASK; + upper_limit = &pciauto_upper_memspc; + pr_debug("PCI Autoconfig: BAR %d, Mem, ", bar_nr); + } + + /* Allocate a base address (bar_size is negative!) */ + *upper_limit = (*upper_limit + bar_size) & bar_size; + + /* Write it out and update our limit */ + pci_write_config_dword(dev, bar, *upper_limit); + + /* + * If we are a 64-bit decoder then increment to the + * upper 32 bits of the bar and force it to locate + * in the lower 4GB of memory. + */ + + if (found_mem64) + pci_write_config_dword(dev, (bar+=4), 0x00000000); + + pr_debug("size=0x%x, address=0x%x\n", + ~bar_size + 1, *upper_limit); + } +} + +/* Initialize the interrupt number. */ + +static void __init +pciauto_setup_irq(struct pci_controller* pci_ctrl,struct pci_dev *dev,int devfn) +{ + u8 pin; + int irq = 0; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + + /* Fix illegal pin numbers. 
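The BAR sizing arithmetic in pciauto_setup_bars() above can be illustrated in isolation: after writing 0xffffffff to a BAR, the masked read-back value is the two's complement of the region size, so it doubles as an alignment mask for the downward-growing allocator. A sketch with hypothetical helper names, not part of this file:

#include <stdint.h>

static uint32_t bar_region_size(uint32_t masked_readback)
{
	return ~masked_readback + 1;		/* e.g. 0xfffff000 -> 0x1000 */
}

static uint32_t bar_allocate(uint32_t *upper_limit, uint32_t masked_readback)
{
	/* Same expression as "(*upper_limit + bar_size) & bar_size" above:
	 * step the allocator down by the region size and round it to the
	 * region's natural alignment. */
	*upper_limit = (*upper_limit + masked_readback) & masked_readback;
	return *upper_limit;			/* new base for this BAR */
}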
*/ + + if (pin == 0 || pin > 4) + pin = 1; + + if (pci_ctrl->map_irq) + irq = pci_ctrl->map_irq(dev, PCI_SLOT(devfn), pin); + + if (irq == -1) + irq = 0; + + pr_debug("PCI Autoconfig: Interrupt %d, pin %d\n", irq, pin); + + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + + +static void __init +pciauto_prescan_setup_bridge(struct pci_dev *dev, int current_bus, + int sub_bus, int *iosave, int *memsave) +{ + /* Configure bus number registers */ + pci_write_config_byte(dev, PCI_PRIMARY_BUS, current_bus); + pci_write_config_byte(dev, PCI_SECONDARY_BUS, sub_bus + 1); + pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, 0xff); + + /* Round memory allocator to 1MB boundary */ + pciauto_upper_memspc &= ~(0x100000 - 1); + *memsave = pciauto_upper_memspc; + + /* Round I/O allocator to 4KB boundary */ + pciauto_upper_iospc &= ~(0x1000 - 1); + *iosave = pciauto_upper_iospc; + + /* Set up memory and I/O filter limits, assume 32-bit I/O space */ + pci_write_config_word(dev, PCI_MEMORY_LIMIT, + ((pciauto_upper_memspc - 1) & 0xfff00000) >> 16); + pci_write_config_byte(dev, PCI_IO_LIMIT, + ((pciauto_upper_iospc - 1) & 0x0000f000) >> 8); + pci_write_config_word(dev, PCI_IO_LIMIT_UPPER16, + ((pciauto_upper_iospc - 1) & 0xffff0000) >> 16); +} + +static void __init +pciauto_postscan_setup_bridge(struct pci_dev *dev, int current_bus, int sub_bus, + int *iosave, int *memsave) +{ + int cmdstat; + + /* Configure bus number registers */ + pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, sub_bus); + + /* + * Round memory allocator to 1MB boundary. + * If no space used, allocate minimum. + */ + pciauto_upper_memspc &= ~(0x100000 - 1); + if (*memsave == pciauto_upper_memspc) + pciauto_upper_memspc -= 0x00100000; + + pci_write_config_word(dev, PCI_MEMORY_BASE, pciauto_upper_memspc >> 16); + + /* Allocate 1MB for pre-fretch */ + pci_write_config_word(dev, PCI_PREF_MEMORY_LIMIT, + ((pciauto_upper_memspc - 1) & 0xfff00000) >> 16); + + pciauto_upper_memspc -= 0x100000; + + pci_write_config_word(dev, PCI_PREF_MEMORY_BASE, + pciauto_upper_memspc >> 16); + + /* Round I/O allocator to 4KB boundary */ + pciauto_upper_iospc &= ~(0x1000 - 1); + if (*iosave == pciauto_upper_iospc) + pciauto_upper_iospc -= 0x1000; + + pci_write_config_byte(dev, PCI_IO_BASE, + (pciauto_upper_iospc & 0x0000f000) >> 8); + pci_write_config_word(dev, PCI_IO_BASE_UPPER16, + pciauto_upper_iospc >> 16); + + /* Enable memory and I/O accesses, enable bus master */ + pci_read_config_dword(dev, PCI_COMMAND, &cmdstat); + pci_write_config_dword(dev, PCI_COMMAND, + cmdstat | + PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | + PCI_COMMAND_MASTER); +} + +/* + * Scan the current PCI bus. + */ + + +int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) +{ + int sub_bus, pci_devfn, pci_class, cmdstat, found_multi=0; + unsigned short vid; + unsigned char header_type; + struct pci_dev *dev = &pciauto_dev; + + pciauto_dev.bus = &pciauto_bus; + pciauto_dev.sysdata = pci_ctrl; + pciauto_bus.ops = pci_ctrl->ops; + + /* + * Fetch our I/O and memory space upper boundaries used + * to allocated base addresses on this pci_controller. 
+ */ + + if (current_bus == pci_ctrl->first_busno) + { + pciauto_upper_iospc = pci_ctrl->io_resource.end + 1; + pciauto_upper_memspc = pci_ctrl->mem_resources[0].end + 1; + } + + sub_bus = current_bus; + + for (pci_devfn = 0; pci_devfn < 0xff; pci_devfn++) + { + /* Skip our host bridge */ + if ((current_bus == pci_ctrl->first_busno) && (pci_devfn == 0)) + continue; + + if (PCI_FUNC(pci_devfn) && !found_multi) + continue; + + pciauto_bus.number = current_bus; + pciauto_dev.devfn = pci_devfn; + + /* If config space read fails from this device, move on */ + if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type)) + continue; + + if (!PCI_FUNC(pci_devfn)) + found_multi = header_type & 0x80; + pci_read_config_word(dev, PCI_VENDOR_ID, &vid); + + if (vid == 0xffff || vid == 0x0000) { + found_multi = 0; + continue; + } + + pci_read_config_dword(dev, PCI_CLASS_REVISION, &pci_class); + + if ((pci_class >> 16) == PCI_CLASS_BRIDGE_PCI) { + + int iosave, memsave; + + pr_debug("PCI Autoconfig: Found P2P bridge, device %d\n", + PCI_SLOT(pci_devfn)); + + /* Allocate PCI I/O and/or memory space */ + pciauto_setup_bars(dev, PCI_BASE_ADDRESS_1); + + pciauto_prescan_setup_bridge(dev, current_bus, sub_bus, + &iosave, &memsave); + sub_bus = pciauto_bus_scan(pci_ctrl, sub_bus+1); + pciauto_postscan_setup_bridge(dev, current_bus, sub_bus, + &iosave, &memsave); + pciauto_bus.number = current_bus; + + continue; + + } + + /* + * Found a peripheral, enable some standard + * settings + */ + + pci_read_config_dword(dev, PCI_COMMAND, &cmdstat); + pci_write_config_dword(dev, PCI_COMMAND, + cmdstat | + PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | + PCI_COMMAND_MASTER); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x80); + + /* Allocate PCI I/O and/or memory space */ + pr_debug("PCI Autoconfig: Found Bus %d, Device %d, Function %d\n", + current_bus, PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn)); + + pciauto_setup_bars(dev, PCI_BASE_ADDRESS_5); + pciauto_setup_irq(pci_ctrl, dev, pci_devfn); + } + return sub_bus; +} diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S new file mode 100644 index 000000000..073191222 --- /dev/null +++ b/arch/xtensa/lib/strncpy_user.S @@ -0,0 +1,216 @@ +/* + * arch/xtensa/lib/strncpy_user.S + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * Returns: -EFAULT if exception before terminator, N if the entire + * buffer filled, else strlen. + * + * Copyright (C) 2002 Tensilica Inc. 
+ */ + +#include +#include +#include +#include + +/* + * char *__strncpy_user(char *dst, const char *src, size_t len) + */ + +#ifdef __XTENSA_EB__ +# define MASK0 0xff000000 +# define MASK1 0x00ff0000 +# define MASK2 0x0000ff00 +# define MASK3 0x000000ff +#else +# define MASK0 0x000000ff +# define MASK1 0x0000ff00 +# define MASK2 0x00ff0000 +# define MASK3 0xff000000 +#endif + +# Register use +# a0/ return address +# a1/ stack pointer +# a2/ return value +# a3/ src +# a4/ len +# a5/ mask0 +# a6/ mask1 +# a7/ mask2 +# a8/ mask3 +# a9/ tmp +# a10/ tmp +# a11/ dst + +.text +ENTRY(__strncpy_user) + + abi_entry_default + # a2/ dst, a3/ src, a4/ len + mov a11, a2 # leave dst in return value register + beqz a4, .Lret # if len is zero + movi a5, MASK0 # mask for byte 0 + movi a6, MASK1 # mask for byte 1 + movi a7, MASK2 # mask for byte 2 + movi a8, MASK3 # mask for byte 3 + bbsi.l a3, 0, .Lsrc1mod2 # if only 8-bit aligned + bbsi.l a3, 1, .Lsrc2mod4 # if only 16-bit aligned +.Lsrcaligned: # return here when src is word-aligned + srli a10, a4, 2 # number of loop iterations with 4B per loop + movi a9, 3 + bnone a11, a9, .Laligned + j .Ldstunaligned + +.Lsrc1mod2: # src address is odd +EX(11f) l8ui a9, a3, 0 # get byte 0 + addi a3, a3, 1 # advance src pointer +EX(10f) s8i a9, a11, 0 # store byte 0 + beqz a9, .Lret # if byte 0 is zero + addi a11, a11, 1 # advance dst pointer + addi a4, a4, -1 # decrement len + beqz a4, .Lret # if len is zero + bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned + +.Lsrc2mod4: # src address is 2 mod 4 +EX(11f) l8ui a9, a3, 0 # get byte 0 + /* 1-cycle interlock */ +EX(10f) s8i a9, a11, 0 # store byte 0 + beqz a9, .Lret # if byte 0 is zero + addi a11, a11, 1 # advance dst pointer + addi a4, a4, -1 # decrement len + beqz a4, .Lret # if len is zero +EX(11f) l8ui a9, a3, 1 # get byte 0 + addi a3, a3, 2 # advance src pointer +EX(10f) s8i a9, a11, 0 # store byte 0 + beqz a9, .Lret # if byte 0 is zero + addi a11, a11, 1 # advance dst pointer + addi a4, a4, -1 # decrement len + bnez a4, .Lsrcaligned # if len is nonzero +.Lret: + sub a2, a11, a2 # compute strlen + abi_ret_default + +/* + * dst is word-aligned, src is word-aligned + */ + .align 4 # 1 mod 4 alignment for LOOPNEZ + .byte 0 # (0 mod 4 alignment for LBEG) +.Laligned: +#if XCHAL_HAVE_LOOPS + loopnez a10, .Loop1done +#else + beqz a10, .Loop1done + slli a10, a10, 2 + add a10, a10, a11 # a10 = end of last 4B chunck +#endif +.Loop1: +EX(11f) l32i a9, a3, 0 # get word from src + addi a3, a3, 4 # advance src pointer + bnone a9, a5, .Lz0 # if byte 0 is zero + bnone a9, a6, .Lz1 # if byte 1 is zero + bnone a9, a7, .Lz2 # if byte 2 is zero +EX(10f) s32i a9, a11, 0 # store word to dst + bnone a9, a8, .Lz3 # if byte 3 is zero + addi a11, a11, 4 # advance dst pointer +#if !XCHAL_HAVE_LOOPS + blt a11, a10, .Loop1 +#endif + +.Loop1done: + bbci.l a4, 1, .L100 + # copy 2 bytes +EX(11f) l16ui a9, a3, 0 + addi a3, a3, 2 # advance src pointer +#ifdef __XTENSA_EB__ + bnone a9, a7, .Lz0 # if byte 2 is zero + bnone a9, a8, .Lz1 # if byte 3 is zero +#else + bnone a9, a5, .Lz0 # if byte 0 is zero + bnone a9, a6, .Lz1 # if byte 1 is zero +#endif +EX(10f) s16i a9, a11, 0 + addi a11, a11, 2 # advance dst pointer +.L100: + bbci.l a4, 0, .Lret +EX(11f) l8ui a9, a3, 0 + /* slot */ +EX(10f) s8i a9, a11, 0 + beqz a9, .Lret # if byte is zero + addi a11, a11, 1-3 # advance dst ptr 1, but also cancel + # the effect of adding 3 in .Lz3 code + /* fall thru to .Lz3 and "retw" */ + +.Lz3: # byte 3 is zero + addi a11, a11, 3 # advance dst pointer + sub 
a2, a11, a2 # compute strlen + abi_ret_default +.Lz0: # byte 0 is zero +#ifdef __XTENSA_EB__ + movi a9, 0 +#endif /* __XTENSA_EB__ */ +EX(10f) s8i a9, a11, 0 + sub a2, a11, a2 # compute strlen + abi_ret_default +.Lz1: # byte 1 is zero +#ifdef __XTENSA_EB__ + extui a9, a9, 16, 16 +#endif /* __XTENSA_EB__ */ +EX(10f) s16i a9, a11, 0 + addi a11, a11, 1 # advance dst pointer + sub a2, a11, a2 # compute strlen + abi_ret_default +.Lz2: # byte 2 is zero +#ifdef __XTENSA_EB__ + extui a9, a9, 16, 16 +#endif /* __XTENSA_EB__ */ +EX(10f) s16i a9, a11, 0 + movi a9, 0 +EX(10f) s8i a9, a11, 2 + addi a11, a11, 2 # advance dst pointer + sub a2, a11, a2 # compute strlen + abi_ret_default + + .align 4 # 1 mod 4 alignment for LOOPNEZ + .byte 0 # (0 mod 4 alignment for LBEG) +.Ldstunaligned: +/* + * for now just use byte copy loop + */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lunalignedend +#else + beqz a4, .Lunalignedend + add a10, a11, a4 # a10 = ending address +#endif /* XCHAL_HAVE_LOOPS */ +.Lnextbyte: +EX(11f) l8ui a9, a3, 0 + addi a3, a3, 1 +EX(10f) s8i a9, a11, 0 + beqz a9, .Lunalignedend + addi a11, a11, 1 +#if !XCHAL_HAVE_LOOPS + blt a11, a10, .Lnextbyte +#endif + +.Lunalignedend: + sub a2, a11, a2 # compute strlen + abi_ret_default + +ENDPROC(__strncpy_user) + + .section .fixup, "ax" + .align 4 + + /* For now, just return -EFAULT. Future implementations might + * like to clear remaining kernel space, like the fixup + * implementation in memset(). Thus, we differentiate between + * load/store fixups. */ + +10: +11: + movi a2, -EFAULT + abi_ret_default diff --git a/arch/xtensa/lib/strnlen_user.S b/arch/xtensa/lib/strnlen_user.S new file mode 100644 index 000000000..3d391dca3 --- /dev/null +++ b/arch/xtensa/lib/strnlen_user.S @@ -0,0 +1,141 @@ +/* + * arch/xtensa/lib/strnlen_user.S + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * Returns strnlen, including trailing zero terminator. + * Zero indicates error. + * + * Copyright (C) 2002 Tensilica Inc. + */ + +#include +#include +#include + +/* + * size_t __strnlen_user(const char *s, size_t len) + */ + +#ifdef __XTENSA_EB__ +# define MASK0 0xff000000 +# define MASK1 0x00ff0000 +# define MASK2 0x0000ff00 +# define MASK3 0x000000ff +#else +# define MASK0 0x000000ff +# define MASK1 0x0000ff00 +# define MASK2 0x00ff0000 +# define MASK3 0xff000000 +#endif + +# Register use: +# a2/ src +# a3/ len +# a4/ tmp +# a5/ mask0 +# a6/ mask1 +# a7/ mask2 +# a8/ mask3 +# a9/ tmp +# a10/ tmp + +.text +ENTRY(__strnlen_user) + + abi_entry_default + # a2/ s, a3/ len + addi a4, a2, -4 # because we overincrement at the end; + # we compensate with load offsets of 4 + movi a5, MASK0 # mask for byte 0 + movi a6, MASK1 # mask for byte 1 + movi a7, MASK2 # mask for byte 2 + movi a8, MASK3 # mask for byte 3 + bbsi.l a2, 0, .L1mod2 # if only 8-bit aligned + bbsi.l a2, 1, .L2mod4 # if only 16-bit aligned + +/* + * String is word-aligned. 
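The zero-terminator test used by the word-at-a-time loops here and in strncpy_user boils down to masking one byte lane of the loaded word at a time, which is exactly what BNONE does with MASK0..MASK3. A little-endian C illustration (big-endian simply reverses the mask order, as the #ifdefs above show):

#include <stdint.h>

static int byte_is_zero(uint32_t word, unsigned int lane)	/* lane = 0..3 */
{
	uint32_t mask = 0xffu << (8 * lane);	/* MASK0..MASK3, little-endian */

	return (word & mask) == 0;		/* "bnone word, mask, label" */
}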
+ */ +.Laligned: + srli a10, a3, 2 # number of loop iterations with 4B per loop +#if XCHAL_HAVE_LOOPS + loopnez a10, .Ldone +#else + beqz a10, .Ldone + slli a10, a10, 2 + add a10, a10, a4 # a10 = end of last 4B chunk +#endif /* XCHAL_HAVE_LOOPS */ +.Loop: +EX(10f) l32i a9, a4, 4 # get next word of string + addi a4, a4, 4 # advance string pointer + bnone a9, a5, .Lz0 # if byte 0 is zero + bnone a9, a6, .Lz1 # if byte 1 is zero + bnone a9, a7, .Lz2 # if byte 2 is zero + bnone a9, a8, .Lz3 # if byte 3 is zero +#if !XCHAL_HAVE_LOOPS + blt a4, a10, .Loop +#endif + +.Ldone: +EX(10f) l32i a9, a4, 4 # load 4 bytes for remaining checks + + bbci.l a3, 1, .L100 + # check two more bytes (bytes 0, 1 of word) + addi a4, a4, 2 # advance string pointer + bnone a9, a5, .Lz0 # if byte 0 is zero + bnone a9, a6, .Lz1 # if byte 1 is zero +.L100: + bbci.l a3, 0, .L101 + # check one more byte (byte 2 of word) + # Actually, we don't need to check. Zero or nonzero, we'll add one. + # Do not add an extra one for the NULL terminator since we have + # exhausted the original len parameter. + addi a4, a4, 1 # advance string pointer +.L101: + sub a2, a4, a2 # compute length + abi_ret_default + +# NOTE that in several places below, we point to the byte just after +# the zero byte in order to include the NULL terminator in the count. + +.Lz3: # byte 3 is zero + addi a4, a4, 3 # point to zero byte +.Lz0: # byte 0 is zero + addi a4, a4, 1 # point just beyond zero byte + sub a2, a4, a2 # subtract to get length + abi_ret_default +.Lz1: # byte 1 is zero + addi a4, a4, 1+1 # point just beyond zero byte + sub a2, a4, a2 # subtract to get length + abi_ret_default +.Lz2: # byte 2 is zero + addi a4, a4, 2+1 # point just beyond zero byte + sub a2, a4, a2 # subtract to get length + abi_ret_default + +.L1mod2: # address is odd +EX(10f) l8ui a9, a4, 4 # get byte 0 + addi a4, a4, 1 # advance string pointer + beqz a9, .Lz3 # if byte 0 is zero + bbci.l a4, 1, .Laligned # if string pointer is now word-aligned + +.L2mod4: # address is 2 mod 4 + addi a4, a4, 2 # advance ptr for aligned access +EX(10f) l32i a9, a4, 0 # get word with first two bytes of string + bnone a9, a7, .Lz2 # if byte 2 (of word, not string) is zero + bany a9, a8, .Laligned # if byte 3 (of word, not string) is nonzero + # byte 3 is zero + addi a4, a4, 3+1 # point just beyond zero byte + sub a2, a4, a2 # subtract to get length + abi_ret_default + +ENDPROC(__strnlen_user) + + .section .fixup, "ax" + .align 4 +10: + movi a2, 0 + abi_ret_default diff --git a/arch/xtensa/lib/udivsi3.S b/arch/xtensa/lib/udivsi3.S new file mode 100644 index 000000000..d2477e078 --- /dev/null +++ b/arch/xtensa/lib/udivsi3.S @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__udivsi3) + + abi_entry_default +#if XCHAL_HAVE_DIV32 + quou a2, a2, a3 +#else + bltui a3, 2, .Lle_one /* check if the divisor <= 1 */ + + mov a6, a2 /* keep dividend in a6 */ + do_nsau a5, a6, a2, a7 /* dividend_shift = nsau (dividend) */ + do_nsau a4, a3, a2, a7 /* divisor_shift = nsau (divisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = divisor_shift - dividend_shift */ + ssl a4 + sll a3, a3 /* divisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + 
addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment quotient if dividend >= divisor */ +.Lreturn: + abi_ret_default + +.Lle_one: + beqz a3, .Lerror /* if divisor == 1, return the dividend */ + abi_ret_default + +.Lspecial: + /* return dividend >= divisor */ + bltu a6, a3, .Lreturn0 + movi a2, 1 + abi_ret_default + +.Lerror: + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + abi_ret_default + +ENDPROC(__udivsi3) diff --git a/arch/xtensa/lib/umodsi3.S b/arch/xtensa/lib/umodsi3.S new file mode 100644 index 000000000..5f031bfa0 --- /dev/null +++ b/arch/xtensa/lib/umodsi3.S @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +ENTRY(__umodsi3) + + abi_entry_default +#if XCHAL_HAVE_DIV32 + remu a2, a2, a3 +#else + bltui a3, 2, .Lle_one /* check if the divisor is <= 1 */ + + do_nsau a5, a2, a6, a7 /* dividend_shift = nsau (dividend) */ + do_nsau a4, a3, a6, a7 /* divisor_shift = nsau (divisor) */ + bgeu a5, a4, .Lspecial + + sub a4, a4, a5 /* count = divisor_shift - dividend_shift */ + ssl a4 + sll a3, a3 /* divisor <<= count */ + + /* test-subtract-and-shift loop */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a2, a3, .Lzerobit + sub a2, a2, a3 +.Lzerobit: + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + +.Lspecial: + bltu a2, a3, .Lreturn + sub a2, a2, a3 /* subtract once more if dividend >= divisor */ +.Lreturn: + abi_ret_default + +.Lle_one: + bnez a3, .Lreturn0 + + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + abi_ret_default + +ENDPROC(__umodsi3) diff --git a/arch/xtensa/lib/umulsidi3.S b/arch/xtensa/lib/umulsidi3.S new file mode 100644 index 000000000..4d9ba2387 --- /dev/null +++ b/arch/xtensa/lib/umulsidi3.S @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ +#include +#include +#include + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 || XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 0 +#else +#define XCHAL_NO_MUL 1 +#endif + +ENTRY(__umulsidi3) + +#ifdef __XTENSA_CALL0_ABI__ + abi_entry(32) + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + abi_entry(32) +#else + abi_entry_default +#endif + +#ifdef __XTENSA_EB__ +#define wh a2 +#define wl a3 +#else +#define wh a3 +#define wl a2 +#endif /* __XTENSA_EB__ */ + + /* This code is taken from the mulsf3 routine in ieee754-sf.S. + See more comments there. */ + +#if XCHAL_HAVE_MUL32_HIGH + mull a6, a2, a3 + muluh wh, a2, a3 + mov wl, a6 + +#else /* ! MUL32_HIGH */ + +#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL + /* a0 and a8 will be clobbered by calling the multiply function + but a8 is not used here and need not be saved. 
*/ + s32i a0, sp, 0 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define a2h a4 +#define a3h a5 + + /* Get the high halves of the inputs into registers. */ + srli a2h, a2, 16 + srli a3h, a3, 16 + +#define a2l a2 +#define a3l a3 + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. */ + extui a2, a2, 0, 16 + extui a3, a3, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. */ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#ifdef __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a6 with carry-out in a9. */ + do_mul(a6, a2, l, a3, h) /* pp 1 */ + do_mul(a11, a2, h, a3, l) /* pp 2 */ + movi a9, 0 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Shift the high half of a9/a6 into position in a9. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a9, a9, a6 + + /* Compute the low word into a6. */ + do_mul(a11, a2, l, a3, l) /* pp 0 */ + sll a6, a6 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Compute the high word into wh. */ + do_mul(wh, a2, h, a3, h) /* pp 3 */ + add wh, wh, a9 + mov wl, a6 + +#endif /* !MUL32_HIGH */ + +#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL + /* Restore the original return address. */ + l32i a0, sp, 0 +#endif +#ifdef __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + abi_ret(32) +#else + abi_ret_default +#endif + +#if XCHAL_NO_MUL + + .macro do_addx2 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx2 \dst, \as, \at +#else + slli \tmp, \as, 1 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx4 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx4 \dst, \as, \at +#else + slli \tmp, \as, 2 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx8 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx8 \dst, \as, \at +#else + slli \tmp, \as, 3 + add \dst, \tmp, \at +#endif + .endm + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying 16-bit chunks of + the floating-point mantissas. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. 
*/ + .align 4 +.Lmul_mulsi3: + abi_entry_default + + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm + +#ifdef __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. */ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + abi_ret_default +#endif /* XCHAL_NO_MUL */ + +ENDPROC(__umulsidi3) diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S new file mode 100644 index 000000000..16128c094 --- /dev/null +++ b/arch/xtensa/lib/usercopy.S @@ -0,0 +1,300 @@ +/* + * arch/xtensa/lib/usercopy.S + * + * Copy to/from user space (derived from arch/xtensa/lib/hal/memcopy.S) + * + * DO NOT COMBINE this function with . + * It needs to remain separate and distinct. The hal files are part + * of the Xtensa link-time HAL, and those files may differ per + * processor configuration. Patching the kernel for another + * processor configuration includes replacing the hal files, and we + * could lose the special functionality for accessing user-space + * memory during such a patch. We sacrifice a little code space here + * in favor to simplify code maintenance. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * Copyright (C) 2002 Tensilica Inc. + */ + + +/* + * size_t __xtensa_copy_user (void *dst, const void *src, size_t len); + * + * The returned value is the number of bytes not copied. Implies zero + * is success. + * + * The general case algorithm is as follows: + * If the destination and source are both aligned, + * do 16B chunks with a loop, and then finish up with + * 8B, 4B, 2B, and 1B copies conditional on the length. + * If destination is aligned and source unaligned, + * do the same, but use SRC to align the source data. + * If destination is unaligned, align it by conditionally + * copying 1B and 2B and then retest. + * This code tries to use fall-through braches for the common + * case of aligned destinations (except for the branches to + * the alignment label). 
+ * + * Register use: + * a0/ return address + * a1/ stack pointer + * a2/ return value + * a3/ src + * a4/ length + * a5/ dst + * a6/ tmp + * a7/ tmp + * a8/ tmp + * a9/ tmp + * a10/ tmp + * a11/ original length + */ + +#include +#include +#include + + .text +ENTRY(__xtensa_copy_user) + +#if !XCHAL_HAVE_LOOPS && defined(__XTENSA_CALL0_ABI__) +#define STACK_SIZE 4 +#else +#define STACK_SIZE 0 +#endif + abi_entry(STACK_SIZE) + # a2/ dst, a3/ src, a4/ len + mov a5, a2 # copy dst so that a2 is return value + mov a11, a4 # preserve original len for error case +.Lcommon: + bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2 + bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4 +.Ldstaligned: # return here from .Ldstunaligned when dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is also aligned, + bnone a3, a8, .Laligned # then use word copy + __ssa8 a3 # set shift amount from byte offset + bnez a4, .Lsrcunaligned + movi a2, 0 # return success for len==0 + abi_ret(STACK_SIZE) + +/* + * Destination is unaligned + */ + +.Ldst1mod2: # dst is only byte aligned + bltui a4, 7, .Lbytecopy # do short copies byte by byte + + # copy 1 byte +EX(10f) l8ui a6, a3, 0 + addi a3, a3, 1 +EX(10f) s8i a6, a5, 0 + addi a5, a5, 1 + addi a4, a4, -1 + bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then + # return to main algorithm +.Ldst2mod4: # dst 16-bit aligned + # copy 2 bytes + bltui a4, 6, .Lbytecopy # do short copies byte by byte +EX(10f) l8ui a6, a3, 0 +EX(10f) l8ui a7, a3, 1 + addi a3, a3, 2 +EX(10f) s8i a6, a5, 0 +EX(10f) s8i a7, a5, 1 + addi a5, a5, 2 + addi a4, a4, -2 + j .Ldstaligned # dst is now aligned, return to main algorithm + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytecopydone + add a7, a3, a4 # a7 = end address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lnextbyte: +EX(10f) l8ui a6, a3, 0 + addi a3, a3, 1 +EX(10f) s8i a6, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + blt a3, a7, .Lnextbyte +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytecopydone: + movi a2, 0 # return success for len bytes copied + abi_ret(STACK_SIZE) + +/* + * Destination and source are word-aligned. 
+ */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src + .align 4 # 1 mod 4 alignment for LOOPNEZ + .byte 0 # (0 mod 4 alignment for LBEG) +.Laligned: +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a8, a7, 4 + add a8, a8, a3 # a8 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: +EX(10f) l32i a6, a3, 0 +EX(10f) l32i a7, a3, 4 +EX(10f) s32i a6, a5, 0 +EX(10f) l32i a6, a3, 8 +EX(10f) s32i a7, a5, 4 +EX(10f) l32i a7, a3, 12 +EX(10f) s32i a6, a5, 8 + addi a3, a3, 16 +EX(10f) s32i a7, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a8, .Loop1 +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # copy 8 bytes +EX(10f) l32i a6, a3, 0 +EX(10f) l32i a7, a3, 4 + addi a3, a3, 8 +EX(10f) s32i a6, a5, 0 +EX(10f) s32i a7, a5, 4 + addi a5, a5, 8 +.L2: + bbci.l a4, 2, .L3 + # copy 4 bytes +EX(10f) l32i a6, a3, 0 + addi a3, a3, 4 +EX(10f) s32i a6, a5, 0 + addi a5, a5, 4 +.L3: + bbci.l a4, 1, .L4 + # copy 2 bytes +EX(10f) l16ui a6, a3, 0 + addi a3, a3, 2 +EX(10f) s16i a6, a5, 0 + addi a5, a5, 2 +.L4: + bbci.l a4, 0, .L5 + # copy 1 byte +EX(10f) l8ui a6, a3, 0 +EX(10f) s8i a6, a5, 0 +.L5: + movi a2, 0 # return success for len bytes copied + abi_ret(STACK_SIZE) + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 + .byte 0 # 1 mod 4 alignement for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lsrcunaligned: + # copy 16 bytes per iteration for word-aligned dst and unaligned src + and a10, a3, a8 # save unalignment offset for below + sub a3, a3, a10 # align a3 (to avoid sim warnings only; not needed for hardware) +EX(10f) l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop2done +#if defined(__XTENSA_CALL0_ABI__) + s32i a10, a1, 0 + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#else + slli a12, a7, 4 + add a12, a12, a3 # a12 = end of last 16B source chunk +#endif +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2: +EX(10f) l32i a7, a3, 4 +EX(10f) l32i a8, a3, 8 + __src_b a6, a6, a7 +EX(10f) s32i a6, a5, 0 +EX(10f) l32i a9, a3, 12 + __src_b a7, a7, a8 +EX(10f) s32i a7, a5, 4 +EX(10f) l32i a6, a3, 16 + __src_b a8, a8, a9 +EX(10f) s32i a8, a5, 8 + addi a3, a3, 16 + __src_b a9, a9, a6 +EX(10f) s32i a9, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS +#if defined(__XTENSA_CALL0_ABI__) + blt a3, a10, .Loop2 + l32i a10, a1, 0 +#else + blt a3, a12, .Loop2 +#endif +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2done: + bbci.l a4, 3, .L12 + # copy 8 bytes +EX(10f) l32i a7, a3, 4 +EX(10f) l32i a8, a3, 8 + __src_b a6, a6, a7 +EX(10f) s32i a6, a5, 0 + addi a3, a3, 8 + __src_b a7, a7, a8 +EX(10f) s32i a7, a5, 4 + addi a5, a5, 8 + mov a6, a8 +.L12: + bbci.l a4, 2, .L13 + # copy 4 bytes +EX(10f) l32i a7, a3, 4 + addi a3, a3, 4 + __src_b a6, a6, a7 +EX(10f) s32i a6, a5, 0 + addi a5, a5, 4 + mov a6, a7 +.L13: + add a3, a3, a10 # readjust a3 with correct misalignment + bbci.l a4, 1, .L14 + # copy 2 bytes +EX(10f) l8ui a6, a3, 0 +EX(10f) l8ui a7, a3, 1 + addi a3, a3, 2 +EX(10f) s8i a6, a5, 0 +EX(10f) s8i a7, a5, 1 + addi a5, a5, 2 +.L14: + bbci.l a4, 0, .L15 + # copy 1 byte +EX(10f) l8ui a6, a3, 0 +EX(10f) s8i a6, a5, 0 +.L15: + movi a2, 0 # return success for len bytes copied + abi_ret(STACK_SIZE) + +ENDPROC(__xtensa_copy_user) + + .section .fixup, "ax" + .align 4 + +/* a2 = original dst; a5 = current dst; a11= original len + * bytes_copied = a5 - a2 + * retval = bytes_not_copied = original len - 
bytes_copied + * retval = a11 - (a5 - a2) + */ + + +10: + sub a2, a5, a2 /* a2 <-- bytes copied */ + sub a2, a11, a2 /* a2 <-- bytes not copied */ + abi_ret(STACK_SIZE)
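
The word-at-a-time scan in __strnlen_user above can be read as the following C sketch. It is illustrative only, not kernel code: the function name is made up, the byte masks are shown for a little-endian configuration, and the EX()/.fixup fault handling that makes the real routine return 0 on a user-space fault is omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MASK0 0x000000ffu	/* byte 0 of a loaded word (little-endian) */
#define MASK1 0x0000ff00u	/* byte 1 */
#define MASK2 0x00ff0000u	/* byte 2 */
#define MASK3 0xff000000u	/* byte 3 */

size_t strnlen_user_model(const char *s, size_t len)
{
	size_t n = 0;
	uint32_t w;

	/* Word-at-a-time scan (.Laligned/.Loop): load four bytes and test
	 * each one against its mask, which is what "bnone a9, aX, .LzN"
	 * does in the assembly. */
	while (n + 4 <= len) {
		memcpy(&w, s + n, 4);		/* the asm uses a single l32i */
		if (!(w & MASK0))
			return n + 1;		/* count includes the NUL */
		if (!(w & MASK1))
			return n + 2;
		if (!(w & MASK2))
			return n + 3;
		if (!(w & MASK3))
			return n + 4;
		n += 4;
	}

	/* One to three trailing bytes (.Ldone/.L100/.L101).  The assembly
	 * loads one more whole word and masks it; plain byte loads keep
	 * this sketch simple.  If the limit is reached before a NUL is
	 * found, the result is simply len, with no extra byte added for
	 * a terminator. */
	while (n < len) {
		if (!s[n])
			return n + 1;
		n++;
	}
	return len;
}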
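
For reference, the shift-and-subtract loop that __udivsi3 falls back to when XCHAL_HAVE_DIV32 is 0 can be modelled in C roughly as below. This is a sketch under stated assumptions, not kernel code: nsau() stands in for the do_nsau helper, the function name is hypothetical, and the divide-by-zero case simply returns 0 where the assembly raises an illegal instruction followed by the "DIV0" marker. __umodsi3 uses the same normalization and loop but keeps only the running remainder (the dividend variable) instead of building a quotient.

#include <stdint.h>

/* Stand-in for the do_nsau helper (NSAU: "normalization shift amount
 * unsigned"); returns 32 for x == 0, else the number of leading zeros. */
static unsigned int nsau(uint32_t x)
{
	unsigned int n = 0;

	if (x == 0)
		return 32;
	while (!(x & 0x80000000u)) {
		x <<= 1;
		n++;
	}
	return n;
}

uint32_t udivsi3_model(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient = 0;
	unsigned int dividend_shift, divisor_shift, count;

	/* .Lle_one: a divisor of 1 returns the dividend unchanged.  The
	 * assembly traps on divisor == 0; the sketch just returns 0 to
	 * stay self-contained. */
	if (divisor == 0)
		return 0;
	if (divisor == 1)
		return dividend;

	dividend_shift = nsau(dividend);
	divisor_shift = nsau(divisor);

	/* .Lspecial: the dividend has no more significant bits than the
	 * divisor, so the quotient is 0 or 1. */
	if (dividend_shift >= divisor_shift)
		return dividend >= divisor ? 1 : 0;

	/* Left-align the divisor under the dividend, then run one
	 * test-subtract-and-shift step per quotient bit (.Lloop). */
	count = divisor_shift - dividend_shift;
	divisor <<= count;
	while (count--) {
		if (dividend >= divisor) {
			dividend -= divisor;
			quotient += 1;
		}
		quotient <<= 1;
		divisor >>= 1;
	}

	/* Final correction, as after .Lloopend. */
	if (dividend >= divisor)
		quotient++;
	return quotient;
}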
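
The partial-product accumulation in the !MUL32_HIGH branch of __umulsidi3 corresponds roughly to the C sketch below (hypothetical name, illustration only). The inputs are split into 16-bit halves because do_mul() may be implemented with MUL16, MUL32, MAC16 or the .Lmul_mulsi3 software helper, all of which are abstracted here as plain 16x16->32 multiplies.

#include <stdint.h>

uint64_t umulsidi3_model(uint32_t x, uint32_t y)
{
	uint32_t xl = x & 0xffff, xh = x >> 16;
	uint32_t yl = y & 0xffff, yh = y >> 16;

	/* Four 16x16->32 partial products, as produced by do_mul(). */
	uint32_t pp0 = xl * yl;		/* low  * low   */
	uint32_t pp1 = xl * yh;		/* low  * high  */
	uint32_t pp2 = xh * yl;		/* high * low   */
	uint32_t pp3 = xh * yh;		/* high * high  */

	/* Add the two middle products and keep the carry-out, which the
	 * assembly tracks in a9 with the bgeu/addi pair. */
	uint32_t mid = pp1 + pp2;
	uint32_t carry = (mid < pp1);

	/* carry:mid straddles the two result words: its low 16 bits go
	 * into the low word and the rest into the high word (the
	 * "ssai 16" / "src a9, a9, a6" step). */
	uint32_t high = pp3 + (carry << 16) + (mid >> 16);
	uint32_t low = pp0 + (mid << 16);

	if (low < pp0)			/* carry from forming the low word */
		high++;

	return ((uint64_t)high << 32) | low;
}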
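
__xtensa_copy_user's overall strategy, minus the __ssa8/__src_b funnel shifting for unaligned sources and the EX()/.fixup fault handling, is roughly the following C sketch (hypothetical name; memcpy stands in for the 16-byte inner loops). The closing comment restates the arithmetic that the fixup code at label 10 uses to return the number of bytes not copied.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

size_t xtensa_copy_user_model(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t remaining = len;

	/* Short copies and the destination-alignment prologue are done
	 * byte by byte (.Lbytecopy, .Ldst1mod2, .Ldst2mod4). */
	if (remaining >= 7) {
		while ((uintptr_t)d & 3) {
			*d++ = *s++;
			remaining--;
		}
	}

	/* Main loop: 16 bytes per iteration, as in .Loop1 (aligned source)
	 * and .Loop2 (unaligned source, where the assembly funnel-shifts
	 * word pairs; memcpy stands in for that detail). */
	while (remaining >= 16) {
		memcpy(d, s, 16);
		d += 16;
		s += 16;
		remaining -= 16;
	}

	/* Tail: 8-, 4-, 2- and 1-byte pieces keyed off the low bits of the
	 * length (.L2..L5 and .L12..L15). */
	memcpy(d, s, remaining);
	d += remaining;

	/* Return the number of bytes NOT copied; 0 means success.  On a
	 * user-space fault the fixup at label 10 computes the same value:
	 * retval = original_len - (current_dst - original_dst). */
	return len - (size_t)(d - (unsigned char *)dst);
}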