diff options
Diffstat (limited to '')
29 files changed, 4850 insertions, 0 deletions
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile new file mode 100644 index 000000000..eb473d373 --- /dev/null +++ b/arch/sh/lib/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for SuperH-specific library files.. +# + +lib-y = delay.o memmove.o memchr.o \ + checksum.o strlen.o div64.o div64-generic.o + +# Extracted from libgcc +obj-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \ + ashlsi3.o ashrsi3.o ashiftrt.o lshrsi3.o \ + udiv_qrnnd.o + +udivsi3-y := udivsi3_i4i-Os.o + +ifneq ($(CONFIG_CC_OPTIMIZE_FOR_SIZE),y) +udivsi3-$(CONFIG_CPU_SH3) := udivsi3_i4i.o +udivsi3-$(CONFIG_CPU_SH4) := udivsi3_i4i.o +endif +udivsi3-y += udivsi3.o + +obj-y += io.o + +memcpy-y := memcpy.o +memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o + +memset-y := memset.o +memset-$(CONFIG_CPU_SH4) := memset-sh4.o + +lib-$(CONFIG_MMU) += copy_page.o __clear_user.o +lib-$(CONFIG_MCOUNT) += mcount.o +lib-y += $(memcpy-y) $(memset-y) $(udivsi3-y) diff --git a/arch/sh/lib/__clear_user.S b/arch/sh/lib/__clear_user.S new file mode 100644 index 000000000..097860690 --- /dev/null +++ b/arch/sh/lib/__clear_user.S @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * __clear_user_page, __clear_user, clear_page implementation of SuperH + * + * Copyright (C) 2001 Kaz Kojima + * Copyright (C) 2001, 2002 Niibe Yutaka + * Copyright (C) 2006 Paul Mundt + */ +#include <linux/linkage.h> +#include <asm/page.h> + +ENTRY(__clear_user) + ! + mov #0, r0 + mov #0xffffffe0, r1 + ! + ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ] + ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] + ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] + ! + ! Clear area 0 + mov r4, r2 + ! + tst r1, r5 ! length < 32 + bt .Larea2 ! skip to remainder + ! + add #31, r2 + and r1, r2 + cmp/eq r4, r2 + bt .Larea1 + mov r2, r3 + sub r4, r3 + mov r3, r7 + mov r4, r2 + ! +.L0: dt r3 +0: mov.b r0, @r2 + bf/s .L0 + add #1, r2 + ! + sub r7, r5 + mov r2, r4 +.Larea1: + mov r4, r3 + add r5, r3 + and r1, r3 + cmp/hi r2, r3 + bf .Larea2 + ! + ! Clear area 1 +#if defined(CONFIG_CPU_SH4) +1: movca.l r0, @r2 +#else +1: mov.l r0, @r2 +#endif + add #4, r2 +2: mov.l r0, @r2 + add #4, r2 +3: mov.l r0, @r2 + add #4, r2 +4: mov.l r0, @r2 + add #4, r2 +5: mov.l r0, @r2 + add #4, r2 +6: mov.l r0, @r2 + add #4, r2 +7: mov.l r0, @r2 + add #4, r2 +8: mov.l r0, @r2 + add #4, r2 + cmp/hi r2, r3 + bt/s 1b + nop + ! + ! Clear area 2 +.Larea2: + mov r4, r3 + add r5, r3 + cmp/hs r3, r2 + bt/s .Ldone + sub r2, r3 +.L2: dt r3 +9: mov.b r0, @r2 + bf/s .L2 + add #1, r2 + ! +.Ldone: rts + mov #0, r0 ! return 0 as normal return + + ! return the number of bytes remained +.Lbad_clear_user: + mov r4, r0 + add r5, r0 + rts + sub r2, r0 + +.section __ex_table,"a" + .align 2 + .long 0b, .Lbad_clear_user + .long 1b, .Lbad_clear_user + .long 2b, .Lbad_clear_user + .long 3b, .Lbad_clear_user + .long 4b, .Lbad_clear_user + .long 5b, .Lbad_clear_user + .long 6b, .Lbad_clear_user + .long 7b, .Lbad_clear_user + .long 8b, .Lbad_clear_user + .long 9b, .Lbad_clear_user +.previous diff --git a/arch/sh/lib/ashiftrt.S b/arch/sh/lib/ashiftrt.S new file mode 100644 index 000000000..0f7145e3c --- /dev/null +++ b/arch/sh/lib/ashiftrt.S @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + + .global __ashiftrt_r4_0 + .global __ashiftrt_r4_1 + .global __ashiftrt_r4_2 + .global __ashiftrt_r4_3 + .global __ashiftrt_r4_4 + .global __ashiftrt_r4_5 + .global __ashiftrt_r4_6 + .global __ashiftrt_r4_7 + .global __ashiftrt_r4_8 + .global __ashiftrt_r4_9 + .global __ashiftrt_r4_10 + .global __ashiftrt_r4_11 + .global __ashiftrt_r4_12 + .global __ashiftrt_r4_13 + .global __ashiftrt_r4_14 + .global __ashiftrt_r4_15 + .global __ashiftrt_r4_16 + .global __ashiftrt_r4_17 + .global __ashiftrt_r4_18 + .global __ashiftrt_r4_19 + .global __ashiftrt_r4_20 + .global __ashiftrt_r4_21 + .global __ashiftrt_r4_22 + .global __ashiftrt_r4_23 + .global __ashiftrt_r4_24 + .global __ashiftrt_r4_25 + .global __ashiftrt_r4_26 + .global __ashiftrt_r4_27 + .global __ashiftrt_r4_28 + .global __ashiftrt_r4_29 + .global __ashiftrt_r4_30 + .global __ashiftrt_r4_31 + .global __ashiftrt_r4_32 + + .align 1 +__ashiftrt_r4_32: +__ashiftrt_r4_31: + rotcl r4 + rts + subc r4,r4 +__ashiftrt_r4_30: + shar r4 +__ashiftrt_r4_29: + shar r4 +__ashiftrt_r4_28: + shar r4 +__ashiftrt_r4_27: + shar r4 +__ashiftrt_r4_26: + shar r4 +__ashiftrt_r4_25: + shar r4 +__ashiftrt_r4_24: + shlr16 r4 + shlr8 r4 + rts + exts.b r4,r4 +__ashiftrt_r4_23: + shar r4 +__ashiftrt_r4_22: + shar r4 +__ashiftrt_r4_21: + shar r4 +__ashiftrt_r4_20: + shar r4 +__ashiftrt_r4_19: + shar r4 +__ashiftrt_r4_18: + shar r4 +__ashiftrt_r4_17: + shar r4 +__ashiftrt_r4_16: + shlr16 r4 + rts + exts.w r4,r4 +__ashiftrt_r4_15: + shar r4 +__ashiftrt_r4_14: + shar r4 +__ashiftrt_r4_13: + shar r4 +__ashiftrt_r4_12: + shar r4 +__ashiftrt_r4_11: + shar r4 +__ashiftrt_r4_10: + shar r4 +__ashiftrt_r4_9: + shar r4 +__ashiftrt_r4_8: + shar r4 +__ashiftrt_r4_7: + shar r4 +__ashiftrt_r4_6: + shar r4 +__ashiftrt_r4_5: + shar r4 +__ashiftrt_r4_4: + shar r4 +__ashiftrt_r4_3: + shar r4 +__ashiftrt_r4_2: + shar r4 +__ashiftrt_r4_1: + rts + shar r4 +__ashiftrt_r4_0: + rts + nop diff --git a/arch/sh/lib/ashldi3.c b/arch/sh/lib/ashldi3.c new file mode 100644 index 000000000..e5afe0935 --- /dev/null +++ b/arch/sh/lib/ashldi3.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> + +#include "libgcc.h" + +long long __ashldi3(long long u, word_type b) +{ + DWunion uu, w; + word_type bm; + + if (b == 0) + return u; + + uu.ll = u; + bm = 32 - b; + + if (bm <= 0) { + w.s.low = 0; + w.s.high = (unsigned int) uu.s.low << -bm; + } else { + const unsigned int carries = (unsigned int) uu.s.low >> bm; + + w.s.low = (unsigned int) uu.s.low << b; + w.s.high = ((unsigned int) uu.s.high << b) | carries; + } + + return w.ll; +} + +EXPORT_SYMBOL(__ashldi3); diff --git a/arch/sh/lib/ashlsi3.S b/arch/sh/lib/ashlsi3.S new file mode 100644 index 000000000..4df4401cd --- /dev/null +++ b/arch/sh/lib/ashlsi3.S @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +! +! __ashlsi3 +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! +! __ashlsi3_r0 +! +! Entry: +! +! r4: Value to shift +! r0: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) + + + .global __ashlsi3 + .global __ashlsi3_r0 + + .align 2 +__ashlsi3: + mov r5,r0 + .align 2 +__ashlsi3_r0: + and #31,r0 + mov.l r4,@-r15 + mov r0,r4 + mova ashlsi3_table,r0 + mov.b @(r0,r4),r4 + add r4,r0 + jmp @r0 + mov.l @r15+,r0 + + .align 2 +ashlsi3_table: + .byte ashlsi3_0-ashlsi3_table + .byte ashlsi3_1-ashlsi3_table + .byte ashlsi3_2-ashlsi3_table + .byte ashlsi3_3-ashlsi3_table + .byte ashlsi3_4-ashlsi3_table + .byte ashlsi3_5-ashlsi3_table + .byte ashlsi3_6-ashlsi3_table + .byte ashlsi3_7-ashlsi3_table + .byte ashlsi3_8-ashlsi3_table + .byte ashlsi3_9-ashlsi3_table + .byte ashlsi3_10-ashlsi3_table + .byte ashlsi3_11-ashlsi3_table + .byte ashlsi3_12-ashlsi3_table + .byte ashlsi3_13-ashlsi3_table + .byte ashlsi3_14-ashlsi3_table + .byte ashlsi3_15-ashlsi3_table + .byte ashlsi3_16-ashlsi3_table + .byte ashlsi3_17-ashlsi3_table + .byte ashlsi3_18-ashlsi3_table + .byte ashlsi3_19-ashlsi3_table + .byte ashlsi3_20-ashlsi3_table + .byte ashlsi3_21-ashlsi3_table + .byte ashlsi3_22-ashlsi3_table + .byte ashlsi3_23-ashlsi3_table + .byte ashlsi3_24-ashlsi3_table + .byte ashlsi3_25-ashlsi3_table + .byte ashlsi3_26-ashlsi3_table + .byte ashlsi3_27-ashlsi3_table + .byte ashlsi3_28-ashlsi3_table + .byte ashlsi3_29-ashlsi3_table + .byte ashlsi3_30-ashlsi3_table + .byte ashlsi3_31-ashlsi3_table + +ashlsi3_6: + shll2 r0 +ashlsi3_4: + shll2 r0 +ashlsi3_2: + rts + shll2 r0 + +ashlsi3_7: + shll2 r0 +ashlsi3_5: + shll2 r0 +ashlsi3_3: + shll2 r0 +ashlsi3_1: + rts + shll r0 + +ashlsi3_14: + shll2 r0 +ashlsi3_12: + shll2 r0 +ashlsi3_10: + shll2 r0 +ashlsi3_8: + rts + shll8 r0 + +ashlsi3_15: + shll2 r0 +ashlsi3_13: + shll2 r0 +ashlsi3_11: + shll2 r0 +ashlsi3_9: + shll8 r0 + rts + shll r0 + +ashlsi3_22: + shll2 r0 +ashlsi3_20: + shll2 r0 +ashlsi3_18: + shll2 r0 +ashlsi3_16: + rts + shll16 r0 + +ashlsi3_23: + shll2 r0 +ashlsi3_21: + shll2 r0 +ashlsi3_19: + shll2 r0 +ashlsi3_17: + shll16 r0 + rts + shll r0 + +ashlsi3_30: + shll2 r0 +ashlsi3_28: + shll2 r0 +ashlsi3_26: + shll2 r0 +ashlsi3_24: + shll16 r0 + rts + shll8 r0 + +ashlsi3_31: + shll2 r0 +ashlsi3_29: + shll2 r0 +ashlsi3_27: + shll2 r0 +ashlsi3_25: + shll16 r0 + shll8 r0 + rts + shll r0 + +ashlsi3_0: + rts + nop diff --git a/arch/sh/lib/ashrdi3.c b/arch/sh/lib/ashrdi3.c new file mode 100644 index 000000000..ae263fbf2 --- /dev/null +++ b/arch/sh/lib/ashrdi3.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> + +#include "libgcc.h" + +long long __ashrdi3(long long u, word_type b) +{ + DWunion uu, w; + word_type bm; + + if (b == 0) + return u; + + uu.ll = u; + bm = 32 - b; + + if (bm <= 0) { + /* w.s.high = 1..1 or 0..0 */ + w.s.high = + uu.s.high >> 31; + w.s.low = uu.s.high >> -bm; + } else { + const unsigned int carries = (unsigned int) uu.s.high << bm; + + w.s.high = uu.s.high >> b; + w.s.low = ((unsigned int) uu.s.low >> b) | carries; + } + + return w.ll; +} + +EXPORT_SYMBOL(__ashrdi3); diff --git a/arch/sh/lib/ashrsi3.S b/arch/sh/lib/ashrsi3.S new file mode 100644 index 000000000..bf3c4e03e --- /dev/null +++ b/arch/sh/lib/ashrsi3.S @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +! +! __ashrsi3 +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! +! __ashrsi3_r0 +! +! Entry: +! +! r4: Value to shift +! r0: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) + + .global __ashrsi3 + .global __ashrsi3_r0 + + .align 2 +__ashrsi3: + mov r5,r0 + .align 2 +__ashrsi3_r0: + and #31,r0 + mov.l r4,@-r15 + mov r0,r4 + mova ashrsi3_table,r0 + mov.b @(r0,r4),r4 + add r4,r0 + jmp @r0 + mov.l @r15+,r0 + + .align 2 +ashrsi3_table: + .byte ashrsi3_0-ashrsi3_table + .byte ashrsi3_1-ashrsi3_table + .byte ashrsi3_2-ashrsi3_table + .byte ashrsi3_3-ashrsi3_table + .byte ashrsi3_4-ashrsi3_table + .byte ashrsi3_5-ashrsi3_table + .byte ashrsi3_6-ashrsi3_table + .byte ashrsi3_7-ashrsi3_table + .byte ashrsi3_8-ashrsi3_table + .byte ashrsi3_9-ashrsi3_table + .byte ashrsi3_10-ashrsi3_table + .byte ashrsi3_11-ashrsi3_table + .byte ashrsi3_12-ashrsi3_table + .byte ashrsi3_13-ashrsi3_table + .byte ashrsi3_14-ashrsi3_table + .byte ashrsi3_15-ashrsi3_table + .byte ashrsi3_16-ashrsi3_table + .byte ashrsi3_17-ashrsi3_table + .byte ashrsi3_18-ashrsi3_table + .byte ashrsi3_19-ashrsi3_table + .byte ashrsi3_20-ashrsi3_table + .byte ashrsi3_21-ashrsi3_table + .byte ashrsi3_22-ashrsi3_table + .byte ashrsi3_23-ashrsi3_table + .byte ashrsi3_24-ashrsi3_table + .byte ashrsi3_25-ashrsi3_table + .byte ashrsi3_26-ashrsi3_table + .byte ashrsi3_27-ashrsi3_table + .byte ashrsi3_28-ashrsi3_table + .byte ashrsi3_29-ashrsi3_table + .byte ashrsi3_30-ashrsi3_table + .byte ashrsi3_31-ashrsi3_table + +ashrsi3_31: + rotcl r0 + rts + subc r0,r0 + +ashrsi3_30: + shar r0 +ashrsi3_29: + shar r0 +ashrsi3_28: + shar r0 +ashrsi3_27: + shar r0 +ashrsi3_26: + shar r0 +ashrsi3_25: + shar r0 +ashrsi3_24: + shlr16 r0 + shlr8 r0 + rts + exts.b r0,r0 + +ashrsi3_23: + shar r0 +ashrsi3_22: + shar r0 +ashrsi3_21: + shar r0 +ashrsi3_20: + shar r0 +ashrsi3_19: + shar r0 +ashrsi3_18: + shar r0 +ashrsi3_17: + shar r0 +ashrsi3_16: + shlr16 r0 + rts + exts.w r0,r0 + +ashrsi3_15: + shar r0 +ashrsi3_14: + shar r0 +ashrsi3_13: + shar r0 +ashrsi3_12: + shar r0 +ashrsi3_11: + shar r0 +ashrsi3_10: + shar r0 +ashrsi3_9: + shar r0 +ashrsi3_8: + shar r0 +ashrsi3_7: + shar r0 +ashrsi3_6: + shar r0 +ashrsi3_5: + shar r0 +ashrsi3_4: + shar r0 +ashrsi3_3: + shar r0 +ashrsi3_2: + shar r0 +ashrsi3_1: + rts + shar r0 + +ashrsi3_0: + rts + nop diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S new file mode 100644 index 000000000..3e07074e0 --- /dev/null +++ b/arch/sh/lib/checksum.S @@ -0,0 +1,365 @@ +/* SPDX-License-Identifier: GPL-2.0+ + * + * $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $ + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * SuperH version: Copyright (C) 1999 Niibe Yutaka + */ + +#include <asm/errno.h> +#include <linux/linkage.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* + * asmlinkage __wsum csum_partial(const void *buf, int len, __wsum sum); + */ + +.text +ENTRY(csum_partial) + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ + mov r4, r0 + tst #3, r0 ! Check alignment. + bt/s 2f ! Jump if alignment is ok. + mov r4, r7 ! Keep a copy to check for alignment + ! + tst #1, r0 ! Check alignment. + bt 21f ! Jump if alignment is boundary of 2bytes. + + ! buf is odd + tst r5, r5 + add #-1, r5 + bt 9f + mov.b @r4+, r0 + extu.b r0, r0 + addc r0, r6 ! t=0 from previous tst + mov r6, r0 + shll8 r6 + shlr16 r0 + shlr8 r0 + or r0, r6 + mov r4, r0 + tst #2, r0 + bt 2f +21: + ! buf is 2 byte aligned (len could be 0) + add #-2, r5 ! Alignment uses up two bytes. + cmp/pz r5 ! + bt/s 1f ! Jump if we had at least two bytes. + clrt + bra 6f + add #2, r5 ! r5 was < 2. Deal with it. +1: + mov.w @r4+, r0 + extu.w r0, r0 + addc r0, r6 + bf 2f + add #1, r6 +2: + ! buf is 4 byte aligned (len could be 0) + mov r5, r1 + mov #-5, r0 + shld r0, r1 + tst r1, r1 + bt/s 4f ! if it's =0, go to 4f + clrt + .align 2 +3: + mov.l @r4+, r0 + mov.l @r4+, r2 + mov.l @r4+, r3 + addc r0, r6 + mov.l @r4+, r0 + addc r2, r6 + mov.l @r4+, r2 + addc r3, r6 + mov.l @r4+, r3 + addc r0, r6 + mov.l @r4+, r0 + addc r2, r6 + mov.l @r4+, r2 + addc r3, r6 + addc r0, r6 + addc r2, r6 + movt r0 + dt r1 + bf/s 3b + cmp/eq #1, r0 + ! here, we know r1==0 + addc r1, r6 ! add carry to r6 +4: + mov r5, r0 + and #0x1c, r0 + tst r0, r0 + bt 6f + ! 4 bytes or more remaining + mov r0, r1 + shlr2 r1 + mov #0, r2 +5: + addc r2, r6 + mov.l @r4+, r2 + movt r0 + dt r1 + bf/s 5b + cmp/eq #1, r0 + addc r2, r6 + addc r1, r6 ! r1==0 here, so it means add carry-bit +6: + ! 3 bytes or less remaining + mov #3, r0 + and r0, r5 + tst r5, r5 + bt 9f ! if it's =0 go to 9f + mov #2, r1 + cmp/hs r1, r5 + bf 7f + mov.w @r4+, r0 + extu.w r0, r0 + cmp/eq r1, r5 + bt/s 8f + clrt + shll16 r0 + addc r0, r6 +7: + mov.b @r4+, r0 + extu.b r0, r0 +#ifndef __LITTLE_ENDIAN__ + shll8 r0 +#endif +8: + addc r0, r6 + mov #0, r0 + addc r0, r6 +9: + ! Check if the buffer was misaligned, if so realign sum + mov r7, r0 + tst #1, r0 + bt 10f + mov r6, r0 + shll8 r6 + shlr16 r0 + shlr8 r0 + or r0, r6 +10: + rts + mov r6, r0 + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, int len) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial with initial + * sum being ~0U + */ + +#define EXC(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +! +! r4: const char *SRC +! r5: char *DST +! r6: int LEN +! +ENTRY(csum_partial_copy_generic) + mov #-1,r7 + mov #3,r0 ! Check src and dest are equally aligned + mov r4,r1 + and r0,r1 + and r5,r0 + cmp/eq r1,r0 + bf 3f ! Different alignments, use slow version + tst #1,r0 ! Check dest word aligned + bf 3f ! If not, do it the slow way + + mov #2,r0 + tst r0,r5 ! Check dest alignment. + bt 2f ! Jump if alignment is ok. + add #-2,r6 ! Alignment uses up two bytes. + cmp/pz r6 ! Jump if we had at least two bytes. + bt/s 1f + clrt + add #2,r6 ! r6 was < 2. Deal with it. + bra 4f + mov r6,r2 + +3: ! Handle different src and dest alignments. + ! This is not common, so simple byte by byte copy will do. + mov r6,r2 + shlr r6 + tst r6,r6 + bt 4f + clrt + .align 2 +5: +EXC( mov.b @r4+,r1 ) +EXC( mov.b @r4+,r0 ) + extu.b r1,r1 +EXC( mov.b r1,@r5 ) +EXC( mov.b r0,@(1,r5) ) + extu.b r0,r0 + add #2,r5 + +#ifdef __LITTLE_ENDIAN__ + shll8 r0 +#else + shll8 r1 +#endif + or r1,r0 + + addc r0,r7 + movt r0 + dt r6 + bf/s 5b + cmp/eq #1,r0 + mov #0,r0 + addc r0, r7 + + mov r2, r0 + tst #1, r0 + bt 7f + bra 5f + clrt + + ! src and dest equally aligned, but to a two byte boundary. + ! Handle first two bytes as a special case + .align 2 +1: +EXC( mov.w @r4+,r0 ) +EXC( mov.w r0,@r5 ) + add #2,r5 + extu.w r0,r0 + addc r0,r7 + mov #0,r0 + addc r0,r7 +2: + mov r6,r2 + mov #-5,r0 + shld r0,r6 + tst r6,r6 + bt/s 2f + clrt + .align 2 +1: +EXC( mov.l @r4+,r0 ) +EXC( mov.l @r4+,r1 ) + addc r0,r7 +EXC( mov.l r0,@r5 ) +EXC( mov.l r1,@(4,r5) ) + addc r1,r7 + +EXC( mov.l @r4+,r0 ) +EXC( mov.l @r4+,r1 ) + addc r0,r7 +EXC( mov.l r0,@(8,r5) ) +EXC( mov.l r1,@(12,r5) ) + addc r1,r7 + +EXC( mov.l @r4+,r0 ) +EXC( mov.l @r4+,r1 ) + addc r0,r7 +EXC( mov.l r0,@(16,r5) ) +EXC( mov.l r1,@(20,r5) ) + addc r1,r7 + +EXC( mov.l @r4+,r0 ) +EXC( mov.l @r4+,r1 ) + addc r0,r7 +EXC( mov.l r0,@(24,r5) ) +EXC( mov.l r1,@(28,r5) ) + addc r1,r7 + add #32,r5 + movt r0 + dt r6 + bf/s 1b + cmp/eq #1,r0 + mov #0,r0 + addc r0,r7 + +2: mov r2,r6 + mov #0x1c,r0 + and r0,r6 + cmp/pl r6 + bf/s 4f + clrt + shlr2 r6 +3: +EXC( mov.l @r4+,r0 ) + addc r0,r7 +EXC( mov.l r0,@r5 ) + add #4,r5 + movt r0 + dt r6 + bf/s 3b + cmp/eq #1,r0 + mov #0,r0 + addc r0,r7 +4: mov r2,r6 + mov #3,r0 + and r0,r6 + cmp/pl r6 + bf 7f + mov #2,r1 + cmp/hs r1,r6 + bf 5f +EXC( mov.w @r4+,r0 ) +EXC( mov.w r0,@r5 ) + extu.w r0,r0 + add #2,r5 + cmp/eq r1,r6 + bt/s 6f + clrt + shll16 r0 + addc r0,r7 +5: +EXC( mov.b @r4+,r0 ) +EXC( mov.b r0,@r5 ) + extu.b r0,r0 +#ifndef __LITTLE_ENDIAN__ + shll8 r0 +#endif +6: addc r0,r7 + mov #0,r0 + addc r0,r7 +7: + +# Exception handler: +.section .fixup, "ax" + +6001: + rts + mov #0,r0 +.previous + rts + mov r7,r0 diff --git a/arch/sh/lib/copy_page.S b/arch/sh/lib/copy_page.S new file mode 100644 index 000000000..d4e9d18ce --- /dev/null +++ b/arch/sh/lib/copy_page.S @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * copy_page, __copy_user_page, __copy_user implementation of SuperH + * + * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima + * Copyright (C) 2002 Toshinobu Sugioka + * Copyright (C) 2006 Paul Mundt + */ +#include <linux/linkage.h> +#include <asm/page.h> + +/* + * copy_page + * @to: P1 address + * @from: P1 address + * + * void copy_page(void *to, void *from) + */ + +/* + * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch + * r8 --- from + PAGE_SIZE + * r9 --- not used + * r10 --- to + * r11 --- from + */ +ENTRY(copy_page) + mov.l r8,@-r15 + mov.l r10,@-r15 + mov.l r11,@-r15 + mov r4,r10 + mov r5,r11 + mov r5,r8 + mov #(PAGE_SIZE >> 10), r0 + shll8 r0 + shll2 r0 + add r0,r8 + ! +1: mov.l @r11+,r0 + mov.l @r11+,r1 + mov.l @r11+,r2 + mov.l @r11+,r3 + mov.l @r11+,r4 + mov.l @r11+,r5 + mov.l @r11+,r6 + mov.l @r11+,r7 +#if defined(CONFIG_CPU_SH4) + movca.l r0,@r10 +#else + mov.l r0,@r10 +#endif + add #32,r10 + mov.l r7,@-r10 + mov.l r6,@-r10 + mov.l r5,@-r10 + mov.l r4,@-r10 + mov.l r3,@-r10 + mov.l r2,@-r10 + mov.l r1,@-r10 + cmp/eq r11,r8 + bf/s 1b + add #28,r10 + ! + mov.l @r15+,r11 + mov.l @r15+,r10 + mov.l @r15+,r8 + rts + nop + +/* + * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); + * Return the number of bytes NOT copied + */ +#define EX(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6000f ; \ + .previous +#define EX_NO_POP(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6005f ; \ + .previous +ENTRY(__copy_user) + ! Check if small number of bytes + mov #11,r0 + mov r4,r3 + cmp/gt r0,r6 ! r6 (len) > r0 (11) + bf/s .L_cleanup_loop_no_pop + add r6,r3 ! last destination address + + ! Calculate bytes needed to align to src + mov.l r11,@-r15 + neg r5,r0 + mov.l r10,@-r15 + add #4,r0 + mov.l r9,@-r15 + and #3,r0 + mov.l r8,@-r15 + tst r0,r0 + bt 2f + +1: + ! Copy bytes to long word align src +EX( mov.b @r5+,r1 ) + dt r0 + add #-1,r6 +EX( mov.b r1,@r4 ) + bf/s 1b + add #1,r4 + + ! Jump to appropriate routine depending on dest +2: mov #3,r1 + mov r6, r2 + and r4,r1 + shlr2 r2 + shll2 r1 + mova .L_jump_tbl,r0 + mov.l @(r0,r1),r1 + jmp @r1 + nop + + .align 2 +.L_jump_tbl: + .long .L_dest00 + .long .L_dest01 + .long .L_dest10 + .long .L_dest11 + +/* + * Come here if there are less than 12 bytes to copy + * + * Keep the branch target close, so the bf/s callee doesn't overflow + * and result in a more expensive branch being inserted. This is the + * fast-path for small copies, the jump via the jump table will hit the + * default slow-path cleanup. -PFM. + */ +.L_cleanup_loop_no_pop: + tst r6,r6 ! Check explicitly for zero + bt 1f + +2: +EX_NO_POP( mov.b @r5+,r0 ) + dt r6 +EX_NO_POP( mov.b r0,@r4 ) + bf/s 2b + add #1,r4 + +1: mov #0,r0 ! normal return +5000: + +# Exception handler: +.section .fixup, "ax" +6005: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + rts + nop + +! Destination = 00 + +.L_dest00: + ! Skip the large copy for small transfers + mov #(32+32-4), r0 + cmp/gt r6, r0 ! r0 (60) > r6 (len) + bt 1f + + ! Align dest to a 32 byte boundary + neg r4,r0 + add #0x20, r0 + and #0x1f, r0 + tst r0, r0 + bt 2f + + sub r0, r6 + shlr2 r0 +3: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 3b + add #4,r4 + +2: +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r2 ) +EX( mov.l @r5+,r7 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.l @r5+,r11 ) +#ifdef CONFIG_CPU_SH4 +EX( movca.l r0,@r4 ) +#else +EX( mov.l r0,@r4 ) +#endif + add #-32, r6 +EX( mov.l r1,@(4,r4) ) + mov #32, r0 +EX( mov.l r2,@(8,r4) ) + cmp/gt r6, r0 ! r0 (32) > r6 (len) +EX( mov.l r7,@(12,r4) ) +EX( mov.l r8,@(16,r4) ) +EX( mov.l r9,@(20,r4) ) +EX( mov.l r10,@(24,r4) ) +EX( mov.l r11,@(28,r4) ) + bf/s 2b + add #32,r4 + +1: mov r6, r0 + shlr2 r0 + tst r0, r0 + bt .L_cleanup +1: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 10 + +.L_dest10: + mov r2,r7 + shlr2 r7 + shlr r7 + tst r7,r7 + mov #7,r0 + bt/s 1f + and r0,r2 +2: + dt r7 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.w r0,@r4 ) + add #2,r4 + xtrct r1,r0 + xtrct r8,r1 + xtrct r9,r8 + xtrct r10,r9 + +EX( mov.l r0,@r4 ) +EX( mov.l r1,@(4,r4) ) +EX( mov.l r8,@(8,r4) ) +EX( mov.l r9,@(12,r4) ) + +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r0 ) + xtrct r1,r10 + xtrct r8,r1 + xtrct r0,r8 + shlr16 r0 +EX( mov.l r10,@(16,r4) ) +EX( mov.l r1,@(20,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.w r0,@(28,r4) ) + bf/s 2b + add #30,r4 +#else +EX( mov.l @(28,r5),r0 ) +EX( mov.l @(24,r5),r8 ) +EX( mov.l @(20,r5),r9 ) +EX( mov.l @(16,r5),r10 ) +EX( mov.w r0,@(30,r4) ) + add #-2,r4 + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(28,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.l r9,@(20,r4) ) + +EX( mov.l @(12,r5),r0 ) +EX( mov.l @(8,r5),r8 ) + xtrct r0,r10 +EX( mov.l @(4,r5),r9 ) + mov.l r10,@(16,r4) +EX( mov.l @r5,r10 ) + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(12,r4) ) +EX( mov.l r8,@(8,r4) ) + swap.w r10,r0 +EX( mov.l r9,@(4,r4) ) +EX( mov.w r0,@(2,r4) ) + + add #32,r5 + bf/s 2b + add #34,r4 +#endif + tst r2,r2 + bt .L_cleanup + +1: ! Read longword, write two words per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.w r0,@(2,r4) ) +#else +EX( mov.w r0,@(2,r4) ) + shlr16 r0 +EX( mov.w r0,@r4 ) +#endif + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 01 or 11 + +.L_dest01: +.L_dest11: + ! Read longword, write byte, word, byte per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.b r0,@r4 ) + shlr8 r0 + add #1,r4 +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.b r0,@(2,r4) ) + bf/s .L_dest01 + add #3,r4 +#else +EX( mov.b r0,@(3,r4) ) + shlr8 r0 + swap.w r0,r7 +EX( mov.b r7,@r4 ) + add #1,r4 +EX( mov.w r0,@r4 ) + bf/s .L_dest01 + add #3,r4 +#endif + +! Cleanup last few bytes +.L_cleanup: + mov r6,r0 + and #3,r0 + tst r0,r0 + bt .L_exit + mov r0,r6 + +.L_cleanup_loop: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s .L_cleanup_loop + add #1,r4 + +.L_exit: + mov #0,r0 ! normal return + +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + mov.l @r15+,r8 + mov.l @r15+,r9 + mov.l @r15+,r10 + rts + mov.l @r15+,r11 diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c new file mode 100644 index 000000000..dad8e6a54 --- /dev/null +++ b/arch/sh/lib/delay.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Precise Delay Loops for SuperH + * + * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima + */ + +#include <linux/sched.h> +#include <linux/delay.h> + +void __delay(unsigned long loops) +{ + __asm__ __volatile__( + /* + * ST40-300 appears to have an issue with this code, + * normally taking two cycles each loop, as with all + * other SH variants. If however the branch and the + * delay slot straddle an 8 byte boundary, this increases + * to 3 cycles. + * This align directive ensures this doesn't occur. + */ + ".balign 8\n\t" + + "tst %0, %0\n\t" + "1:\t" + "bf/s 1b\n\t" + " dt %0" + : "=r" (loops) + : "0" (loops) + : "t"); +} + +inline void __const_udelay(unsigned long xloops) +{ + xloops *= 4; + __asm__("dmulu.l %0, %2\n\t" + "sts mach, %0" + : "=r" (xloops) + : "0" (xloops), + "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)) + : "macl", "mach"); + __delay(++xloops); +} + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ +} + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00000005); +} + diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c new file mode 100644 index 000000000..0b67fbc44 --- /dev/null +++ b/arch/sh/lib/div64-generic.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic __div64_32 wrapper for __xdiv64_32. + */ + +#include <linux/types.h> +#include <asm/div64.h> + +extern uint64_t __xdiv64_32(u64 n, u32 d); + +uint32_t __div64_32(u64 *xp, u32 y) +{ + uint32_t rem; + uint64_t q = __xdiv64_32(*xp, y); + + rem = *xp - q * y; + *xp = q; + + return rem; +} diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S new file mode 100644 index 000000000..4a9a966e7 --- /dev/null +++ b/arch/sh/lib/div64.S @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * unsigned long __xdiv64_32(unsigned long long n, unsigned long d); + */ + +#include <linux/linkage.h> + +.text +ENTRY(__xdiv64_32) +#ifdef CONFIG_CPU_LITTLE_ENDIAN + mov r4, r0 + mov r5, r1 +#else + mov r4, r1 + mov r5, r0 +#endif + cmp/hs r6, r1 + bf.s 1f + mov #0, r2 + + mov r1, r2 + mov #0, r3 + div0u + .rept 32 + rotcl r2 + div1 r6, r3 + .endr + rotcl r2 + mul.l r6, r2 + sts macl, r3 + sub r3, r1 +1: + div0u + .rept 32 + rotcl r0 + div1 r6, r1 + .endr +#ifdef CONFIG_CPU_LITTLE_ENDIAN + mov r2, r1 + rts + rotcl r0 +#else + rotcl r0 + mov r0, r1 + rts + mov r2, r0 +#endif diff --git a/arch/sh/lib/io.c b/arch/sh/lib/io.c new file mode 100644 index 000000000..ebcf7c0a7 --- /dev/null +++ b/arch/sh/lib/io.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch/sh/lib/io.c - SH32 optimized I/O routines + * + * Copyright (C) 2000 Stuart Menefy + * Copyright (C) 2005 Paul Mundt + * + * Provide real functions which expand to whatever the header file defined. + * Also definitions of machine independent IO functions. + */ +#include <linux/module.h> +#include <linux/io.h> + +void __raw_readsl(const void __iomem *addr, void *datap, int len) +{ + u32 *data; + + for (data = datap; (len != 0) && (((u32)data & 0x1f) != 0); len--) + *data++ = __raw_readl(addr); + + if (likely(len >= (0x20 >> 2))) { + int tmp2, tmp3, tmp4, tmp5, tmp6; + + __asm__ __volatile__( + "1: \n\t" + "mov.l @%7, r0 \n\t" + "mov.l @%7, %2 \n\t" +#ifdef CONFIG_CPU_SH4 + "movca.l r0, @%0 \n\t" +#else + "mov.l r0, @%0 \n\t" +#endif + "mov.l @%7, %3 \n\t" + "mov.l @%7, %4 \n\t" + "mov.l @%7, %5 \n\t" + "mov.l @%7, %6 \n\t" + "mov.l @%7, r7 \n\t" + "mov.l @%7, r0 \n\t" + "mov.l %2, @(0x04,%0) \n\t" + "mov #0x20>>2, %2 \n\t" + "mov.l %3, @(0x08,%0) \n\t" + "sub %2, %1 \n\t" + "mov.l %4, @(0x0c,%0) \n\t" + "cmp/hi %1, %2 ! T if 32 > len \n\t" + "mov.l %5, @(0x10,%0) \n\t" + "mov.l %6, @(0x14,%0) \n\t" + "mov.l r7, @(0x18,%0) \n\t" + "mov.l r0, @(0x1c,%0) \n\t" + "bf.s 1b \n\t" + " add #0x20, %0 \n\t" + : "=&r" (data), "=&r" (len), + "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), + "=&r" (tmp5), "=&r" (tmp6) + : "r"(addr), "0" (data), "1" (len) + : "r0", "r7", "t", "memory"); + } + + for (; len != 0; len--) + *data++ = __raw_readl(addr); +} +EXPORT_SYMBOL(__raw_readsl); + +void __raw_writesl(void __iomem *addr, const void *data, int len) +{ + if (likely(len != 0)) { + int tmp1; + + __asm__ __volatile__ ( + "1: \n\t" + "mov.l @%0+, %1 \n\t" + "dt %3 \n\t" + "bf.s 1b \n\t" + " mov.l %1, @%4 \n\t" + : "=&r" (data), "=&r" (tmp1) + : "0" (data), "r" (len), "r"(addr) + : "t", "memory"); + } +} +EXPORT_SYMBOL(__raw_writesl); diff --git a/arch/sh/lib/libgcc.h b/arch/sh/lib/libgcc.h new file mode 100644 index 000000000..58ada9e8f --- /dev/null +++ b/arch/sh/lib/libgcc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_LIBGCC_H +#define __ASM_LIBGCC_H + +#include <asm/byteorder.h> + +typedef int word_type __attribute__ ((mode (__word__))); + +#ifdef __BIG_ENDIAN +struct DWstruct { + int high, low; +}; +#elif defined(__LITTLE_ENDIAN) +struct DWstruct { + int low, high; +}; +#else +#error I feel sick. +#endif + +typedef union { + struct DWstruct s; + long long ll; +} DWunion; + +#endif /* __ASM_LIBGCC_H */ diff --git a/arch/sh/lib/lshrdi3.c b/arch/sh/lib/lshrdi3.c new file mode 100644 index 000000000..33eaa1edb --- /dev/null +++ b/arch/sh/lib/lshrdi3.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> + +#include "libgcc.h" + +long long __lshrdi3(long long u, word_type b) +{ + DWunion uu, w; + word_type bm; + + if (b == 0) + return u; + + uu.ll = u; + bm = 32 - b; + + if (bm <= 0) { + w.s.high = 0; + w.s.low = (unsigned int) uu.s.high >> -bm; + } else { + const unsigned int carries = (unsigned int) uu.s.high << bm; + + w.s.high = (unsigned int) uu.s.high >> b; + w.s.low = ((unsigned int) uu.s.low >> b) | carries; + } + + return w.ll; +} + +EXPORT_SYMBOL(__lshrdi3); diff --git a/arch/sh/lib/lshrsi3.S b/arch/sh/lib/lshrsi3.S new file mode 100644 index 000000000..b79b81700 --- /dev/null +++ b/arch/sh/lib/lshrsi3.S @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +! +! __lshrsi3 +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! +! __lshrsi3_r0 +! +! Entry: +! +! r0: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! + .global __lshrsi3 + .global __lshrsi3_r0 + + .align 2 +__lshrsi3: + mov r5,r0 + .align 2 +__lshrsi3_r0: + and #31,r0 + mov.l r4,@-r15 + mov r0,r4 + mova lshrsi3_table,r0 + mov.b @(r0,r4),r4 + add r4,r0 + jmp @r0 + mov.l @r15+,r0 + + .align 2 +lshrsi3_table: + .byte lshrsi3_0-lshrsi3_table + .byte lshrsi3_1-lshrsi3_table + .byte lshrsi3_2-lshrsi3_table + .byte lshrsi3_3-lshrsi3_table + .byte lshrsi3_4-lshrsi3_table + .byte lshrsi3_5-lshrsi3_table + .byte lshrsi3_6-lshrsi3_table + .byte lshrsi3_7-lshrsi3_table + .byte lshrsi3_8-lshrsi3_table + .byte lshrsi3_9-lshrsi3_table + .byte lshrsi3_10-lshrsi3_table + .byte lshrsi3_11-lshrsi3_table + .byte lshrsi3_12-lshrsi3_table + .byte lshrsi3_13-lshrsi3_table + .byte lshrsi3_14-lshrsi3_table + .byte lshrsi3_15-lshrsi3_table + .byte lshrsi3_16-lshrsi3_table + .byte lshrsi3_17-lshrsi3_table + .byte lshrsi3_18-lshrsi3_table + .byte lshrsi3_19-lshrsi3_table + .byte lshrsi3_20-lshrsi3_table + .byte lshrsi3_21-lshrsi3_table + .byte lshrsi3_22-lshrsi3_table + .byte lshrsi3_23-lshrsi3_table + .byte lshrsi3_24-lshrsi3_table + .byte lshrsi3_25-lshrsi3_table + .byte lshrsi3_26-lshrsi3_table + .byte lshrsi3_27-lshrsi3_table + .byte lshrsi3_28-lshrsi3_table + .byte lshrsi3_29-lshrsi3_table + .byte lshrsi3_30-lshrsi3_table + .byte lshrsi3_31-lshrsi3_table + +lshrsi3_6: + shlr2 r0 +lshrsi3_4: + shlr2 r0 +lshrsi3_2: + rts + shlr2 r0 + +lshrsi3_7: + shlr2 r0 +lshrsi3_5: + shlr2 r0 +lshrsi3_3: + shlr2 r0 +lshrsi3_1: + rts + shlr r0 + +lshrsi3_14: + shlr2 r0 +lshrsi3_12: + shlr2 r0 +lshrsi3_10: + shlr2 r0 +lshrsi3_8: + rts + shlr8 r0 + +lshrsi3_15: + shlr2 r0 +lshrsi3_13: + shlr2 r0 +lshrsi3_11: + shlr2 r0 +lshrsi3_9: + shlr8 r0 + rts + shlr r0 + +lshrsi3_22: + shlr2 r0 +lshrsi3_20: + shlr2 r0 +lshrsi3_18: + shlr2 r0 +lshrsi3_16: + rts + shlr16 r0 + +lshrsi3_23: + shlr2 r0 +lshrsi3_21: + shlr2 r0 +lshrsi3_19: + shlr2 r0 +lshrsi3_17: + shlr16 r0 + rts + shlr r0 + +lshrsi3_30: + shlr2 r0 +lshrsi3_28: + shlr2 r0 +lshrsi3_26: + shlr2 r0 +lshrsi3_24: + shlr16 r0 + rts + shlr8 r0 + +lshrsi3_31: + shlr2 r0 +lshrsi3_29: + shlr2 r0 +lshrsi3_27: + shlr2 r0 +lshrsi3_25: + shlr16 r0 + shlr8 r0 + rts + shlr r0 + +lshrsi3_0: + rts + nop diff --git a/arch/sh/lib/mcount.S b/arch/sh/lib/mcount.S new file mode 100644 index 000000000..c6ca90cc9 --- /dev/null +++ b/arch/sh/lib/mcount.S @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * arch/sh/lib/mcount.S + * + * Copyright (C) 2008, 2009 Paul Mundt + * Copyright (C) 2008, 2009 Matt Fleming + */ +#include <asm/ftrace.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> + +#define MCOUNT_ENTER() \ + mov.l r4, @-r15; \ + mov.l r5, @-r15; \ + mov.l r6, @-r15; \ + mov.l r7, @-r15; \ + sts.l pr, @-r15; \ + \ + mov.l @(20,r15),r4; \ + sts pr, r5 + +#define MCOUNT_LEAVE() \ + lds.l @r15+, pr; \ + mov.l @r15+, r7; \ + mov.l @r15+, r6; \ + mov.l @r15+, r5; \ + rts; \ + mov.l @r15+, r4 + +#ifdef CONFIG_STACK_DEBUG +/* + * Perform diagnostic checks on the state of the kernel stack. + * + * Check for stack overflow. If there is less than 1KB free + * then it has overflowed. + * + * Make sure the stack pointer contains a valid address. Valid + * addresses for kernel stacks are anywhere after the bss + * (after __bss_stop) and anywhere in init_thread_union (init_stack). + */ +#define STACK_CHECK() \ + mov #(THREAD_SIZE >> 10), r0; \ + shll8 r0; \ + shll2 r0; \ + \ + /* r1 = sp & (THREAD_SIZE - 1) */ \ + mov #-1, r1; \ + add r0, r1; \ + and r15, r1; \ + \ + mov #TI_SIZE, r3; \ + mov #(STACK_WARN >> 8), r2; \ + shll8 r2; \ + add r3, r2; \ + \ + /* Is the stack overflowing? */ \ + cmp/hi r2, r1; \ + bf stack_panic; \ + \ + /* If sp > __bss_stop then we're OK. */ \ + mov.l .L_ebss, r1; \ + cmp/hi r1, r15; \ + bt 1f; \ + \ + /* If sp < init_stack, we're not OK. */ \ + mov.l .L_init_thread_union, r1; \ + cmp/hs r1, r15; \ + bf stack_panic; \ + \ + /* If sp > init_stack && sp < __bss_stop, not OK. */ \ + add r0, r1; \ + cmp/hs r1, r15; \ + bt stack_panic; \ +1: +#else +#define STACK_CHECK() +#endif /* CONFIG_STACK_DEBUG */ + + .align 2 + .globl _mcount + .type _mcount,@function + .globl mcount + .type mcount,@function +_mcount: +mcount: + STACK_CHECK() + +#ifndef CONFIG_FUNCTION_TRACER + rts + nop +#else + MCOUNT_ENTER() + +#ifdef CONFIG_DYNAMIC_FTRACE + .globl mcount_call +mcount_call: + mov.l .Lftrace_stub, r6 +#else + mov.l .Lftrace_trace_function, r6 + mov.l ftrace_stub, r7 + cmp/eq r6, r7 + bt skip_trace + mov.l @r6, r6 +#endif + + jsr @r6 + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + mov.l .Lftrace_graph_return, r6 + mov.l .Lftrace_stub, r7 + cmp/eq r6, r7 + bt 1f + + mov.l .Lftrace_graph_caller, r0 + jmp @r0 + nop + +1: + mov.l .Lftrace_graph_entry, r6 + mov.l .Lftrace_graph_entry_stub, r7 + cmp/eq r6, r7 + bt skip_trace + + mov.l .Lftrace_graph_caller, r0 + jmp @r0 + nop + + .align 2 +.Lftrace_graph_return: + .long ftrace_graph_return +.Lftrace_graph_entry: + .long ftrace_graph_entry +.Lftrace_graph_entry_stub: + .long ftrace_graph_entry_stub +.Lftrace_graph_caller: + .long ftrace_graph_caller +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + .globl skip_trace +skip_trace: + MCOUNT_LEAVE() + + .align 2 +.Lftrace_trace_function: + .long ftrace_trace_function + +#ifdef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * NOTE: Do not move either ftrace_graph_call or ftrace_caller + * as this will affect the calculation of GRAPH_INSN_OFFSET. + */ + .globl ftrace_graph_call +ftrace_graph_call: + mov.l .Lskip_trace, r0 + jmp @r0 + nop + + .align 2 +.Lskip_trace: + .long skip_trace +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + .globl ftrace_caller +ftrace_caller: + MCOUNT_ENTER() + + .globl ftrace_call +ftrace_call: + mov.l .Lftrace_stub, r6 + jsr @r6 + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + bra ftrace_graph_call + nop +#else + MCOUNT_LEAVE() +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + + .align 2 + +/* + * NOTE: From here on the locations of the .Lftrace_stub label and + * ftrace_stub itself are fixed. Adding additional data here will skew + * the displacement for the memory table and break the block replacement. + * Place new labels either after the ftrace_stub body, or before + * ftrace_caller. You have been warned. + */ +.Lftrace_stub: + .long ftrace_stub + + .globl ftrace_stub +ftrace_stub: + rts + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .globl ftrace_graph_caller +ftrace_graph_caller: + mov.l 2f, r1 + jmp @r1 + nop +1: + /* + * MCOUNT_ENTER() pushed 5 registers onto the stack, so + * the stack address containing our return address is + * r15 + 20. + */ + mov #20, r0 + add r15, r0 + mov r0, r4 + + mov.l .Lprepare_ftrace_return, r0 + jsr @r0 + nop + + MCOUNT_LEAVE() + + .align 2 +2: .long skip_trace +.Lprepare_ftrace_return: + .long prepare_ftrace_return + + .globl return_to_handler +return_to_handler: + /* + * Save the return values. + */ + mov.l r0, @-r15 + mov.l r1, @-r15 + + mov #0, r4 + + mov.l .Lftrace_return_to_handler, r0 + jsr @r0 + nop + + /* + * The return value from ftrace_return_handler has the real + * address that we should return to. + */ + lds r0, pr + mov.l @r15+, r1 + rts + mov.l @r15+, r0 + + + .align 2 +.Lftrace_return_to_handler: + .long ftrace_return_to_handler +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_STACK_DEBUG + .globl stack_panic +stack_panic: + mov.l .Ldump_stack, r0 + jsr @r0 + nop + + mov.l .Lpanic, r0 + jsr @r0 + mov.l .Lpanic_s, r4 + + rts + nop + + .align 2 +.L_init_thread_union: + .long init_thread_union +.L_ebss: + .long __bss_stop +.Lpanic: + .long panic +.Lpanic_s: + .long .Lpanic_str +.Ldump_stack: + .long dump_stack + + .section .rodata + .align 2 +.Lpanic_str: + .string "Stack error" +#endif /* CONFIG_STACK_DEBUG */ diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S new file mode 100644 index 000000000..8ded10407 --- /dev/null +++ b/arch/sh/lib/memchr.S @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memchr" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memchr(const void *s, int c, size_t n); + */ + +#include <linux/linkage.h> +ENTRY(memchr) + tst r6,r6 + bt/s 2f + exts.b r5,r5 +1: mov.b @r4,r1 + cmp/eq r1,r5 + bt/s 3f + dt r6 + bf/s 1b + add #1,r4 +2: mov #0,r4 +3: rts + mov r4,r0 diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S new file mode 100644 index 000000000..a2435c0f6 --- /dev/null +++ b/arch/sh/lib/memcpy-sh4.S @@ -0,0 +1,800 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * "memcpy" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * Copyright (c) 2002 STMicroelectronics Ltd + * Modified from memcpy.S and micro-optimised for SH4 + * Stuart Menefy (stuart.menefy@st.com) + * + */ +#include <linux/linkage.h> + +/* + * void *memcpy(void *dst, const void *src, size_t n); + * + * It is assumed that there is no overlap between src and dst. + * If there is an overlap, then the results are undefined. + */ + + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + + ! Size is 16 or greater, and may have trailing bytes + + .balign 32 +.Lcase1: + ! Read a long word and write a long word at once + ! At the start of each iteration, r7 contains last long load + add #-1,r5 ! 79 EX + mov r4,r2 ! 5 MT (0 cycles latency) + + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) + add #-4,r5 ! 50 EX + + add #7,r2 ! 79 EX + ! +#ifdef CONFIG_CPU_LITTLE_ENDIAN + ! 6 cycles, 4 bytes per iteration +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK + mov r7, r3 ! 5 MT (latency=0) ! RQPO + + cmp/hi r2,r0 ! 57 MT + shll16 r3 ! 103 EX + + mov r1,r6 ! 5 MT (latency=0) + shll8 r3 ! 102 EX ! Oxxx + + shlr8 r6 ! 106 EX ! xNML + mov r1, r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! ONML + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#else +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN + mov r7,r3 ! 5 MT (latency=0) ! OPQR + + cmp/hi r2,r0 ! 57 MT + shlr16 r3 ! 107 EX + + shlr8 r3 ! 106 EX ! xxxO + mov r1,r6 ! 5 MT (latency=0) + + shll8 r6 ! 102 EX ! LMNx + mov r1,r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! LMNO + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#endif + ! Finally, copy a byte at once, if necessary + + add #4,r5 ! 50 EX + cmp/eq r4,r0 ! 54 MT + + add #-6,r2 ! 50 EX + bt 9f ! 109 BR + +8: cmp/hi r2,r0 ! 57 MT + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + + bt/s 8b ! 109 BR + + mov.b r1,@-r0 ! 29 LS + +9: rts + nop + + + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + + ! Size is 16 or greater, and may have trailing bytes + + .balign 32 +.Lcase3: + ! Read a long word and write a long word at once + ! At the start of each iteration, r7 contains last long load + add #-3,r5 ! 79 EX + mov r4,r2 ! 5 MT (0 cycles latency) + + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) + add #-4,r5 ! 50 EX + + add #7,r2 ! 79 EX + ! +#ifdef CONFIG_CPU_LITTLE_ENDIAN + ! 6 cycles, 4 bytes per iteration +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK + mov r7, r3 ! 5 MT (latency=0) ! RQPO + + cmp/hi r2,r0 ! 57 MT + shll8 r3 ! 102 EX ! QPOx + + mov r1,r6 ! 5 MT (latency=0) + shlr16 r6 ! 107 EX + + shlr8 r6 ! 106 EX ! xxxN + mov r1, r7 ! 5 MT (latency=0) + + or r6,r3 ! 82 EX ! QPON + bt/s 3b ! 109 BR + + mov.l r3,@-r0 ! 30 LS +#else +3: mov r7,r3 ! OPQR + shlr8 r3 ! xOPQ + mov.l @(r0,r5),r7 ! KLMN + mov r7,r6 + shll16 r6 + shll8 r6 ! Nxxx + or r6,r3 ! NOPQ + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + + ! Finally, copy a byte at once, if necessary + + add #6,r5 ! 50 EX + cmp/eq r4,r0 ! 54 MT + + add #-6,r2 ! 50 EX + bt 9f ! 109 BR + +8: cmp/hi r2,r0 ! 57 MT + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + + bt/s 8b ! 109 BR + + mov.b r1,@-r0 ! 29 LS + +9: rts + nop + +ENTRY(memcpy) + + ! Calculate the invariants which will be used in the remainder + ! of the code: + ! + ! r4 --> [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0 --> [ ... ] r0+r5 --> [ ... ] + ! + ! + + ! Short circuit the common case of src, dst and len being 32 bit aligned + ! and test for zero length move + + mov r6, r0 ! 5 MT (0 cycle latency) + or r4, r0 ! 82 EX + + or r5, r0 ! 82 EX + tst r6, r6 ! 86 MT + + bt/s 99f ! 111 BR (zero len) + tst #3, r0 ! 87 MT + + mov r4, r0 ! 5 MT (0 cycle latency) + add r6, r0 ! 49 EX + + mov #16, r1 ! 6 EX + bt/s .Lcase00 ! 111 BR (aligned) + + sub r4, r5 ! 75 EX + + ! Arguments are not nicely long word aligned or zero len. + ! Check for small copies, and if so do a simple byte at a time copy. + ! + ! Deciding on an exact value of 'small' is not easy, as the point at which + ! using the optimised routines become worthwhile varies (these are the + ! cycle counts for differnet sizes using byte-at-a-time vs. optimised): + ! size byte-at-time long word byte + ! 16 42 39-40 46-50 50-55 + ! 24 58 43-44 54-58 62-67 + ! 36 82 49-50 66-70 80-85 + ! However the penalty for getting it 'wrong' is much higher for long word + ! aligned data (and this is more common), so use a value of 16. + + cmp/gt r6,r1 ! 56 MT + + add #-1,r5 ! 50 EX + bf/s 6f ! 108 BR (not small) + + mov r5, r3 ! 5 MT (latency=0) + shlr r6 ! 104 EX + + mov.b @(r0,r5),r1 ! 20 LS (latency=2) + bf/s 4f ! 111 BR + + add #-1,r3 ! 50 EX + tst r6, r6 ! 86 MT + + bt/s 98f ! 110 BR + mov.b r1,@-r0 ! 29 LS + + ! 4 cycles, 2 bytes per iteration +3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) + +4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) + dt r6 ! 67 EX + + mov.b r1,@-r0 ! 29 LS + bf/s 3b ! 111 BR + + mov.b r2,@-r0 ! 29 LS +98: + rts + nop + +99: rts + mov r4, r0 + + ! Size is not small, so its worthwhile looking for optimisations. + ! First align destination to a long word boundary. + ! + ! r5 = normal value -1 + +6: tst #3, r0 ! 87 MT + mov #3, r3 ! 6 EX + + bt/s 2f ! 111 BR + and r0,r3 ! 78 EX + + ! 3 cycles, 1 byte per iteration +1: dt r3 ! 67 EX + mov.b @(r0,r5),r1 ! 19 LS (latency=2) + + add #-1, r6 ! 79 EX + bf/s 1b ! 109 BR + + mov.b r1,@-r0 ! 28 LS + +2: add #1, r5 ! 79 EX + + ! Now select the appropriate bulk transfer code based on relative + ! alignment of src and dst. + + mov r0, r3 ! 5 MT (latency=0) + + mov r5, r0 ! 5 MT (latency=0) + tst #1, r0 ! 87 MT + + bf/s 1f ! 111 BR + mov #64, r7 ! 6 EX + + ! bit 0 clear + + cmp/ge r7, r6 ! 55 MT + + bt/s 2f ! 111 BR + tst #2, r0 ! 87 MT + + ! small + bt/s .Lcase0 + mov r3, r0 + + bra .Lcase2 + nop + + ! big +2: bt/s .Lcase0b + mov r3, r0 + + bra .Lcase2b + nop + + ! bit 0 set +1: tst #2, r0 ! 87 MT + + bt/s .Lcase1 + mov r3, r0 + + bra .Lcase3 + nop + + + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + + ! src, dst and size are all long word aligned + ! size is non-zero + + .balign 32 +.Lcase00: + mov #64, r1 ! 6 EX + mov r5, r3 ! 5 MT (latency=0) + + cmp/gt r6, r1 ! 56 MT + add #-4, r5 ! 50 EX + + bf .Lcase00b ! 108 BR (big loop) + shlr2 r6 ! 105 EX + + shlr r6 ! 104 EX + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + + bf/s 4f ! 111 BR + add #-8, r3 ! 50 EX + + tst r6, r6 ! 86 MT + bt/s 5f ! 110 BR + + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + dt r6 ! 67 EX + + mov.l r1, @-r0 ! 30 LS + bf/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + +5: rts + nop + + + ! Size is 16 or greater and less than 64, but may have trailing bytes + + .balign 32 +.Lcase0: + add #-4, r5 ! 50 EX + mov r4, r7 ! 5 MT (latency=0) + + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + mov #4, r2 ! 6 EX + + add #11, r7 ! 50 EX + tst r2, r6 ! 86 MT + + mov r5, r3 ! 5 MT (latency=0) + bt/s 4f ! 111 BR + + add #-4, r3 ! 50 EX + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + cmp/hi r7, r0 + + mov.l r1, @-r0 ! 30 LS + bt/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + + ! Copy the final 0-3 bytes + + add #3,r5 ! 50 EX + + cmp/eq r0, r4 ! 54 MT + add #-10, r7 ! 50 EX + + bt 9f ! 110 BR + + ! 3 cycles, 1 byte per iteration +1: mov.b @(r0,r5),r1 ! 19 LS + cmp/hi r7,r0 ! 57 MT + + bt/s 1b ! 111 BR + mov.b r1,@-r0 ! 28 LS + +9: rts + nop + + ! Size is at least 64 bytes, so will be going round the big loop at least once. + ! + ! r2 = rounded up r4 + ! r3 = rounded down r0 + + .balign 32 +.Lcase0b: + add #-4, r5 ! 50 EX + +.Lcase00b: + mov r0, r3 ! 5 MT (latency=0) + mov #(~0x1f), r1 ! 6 EX + + and r1, r3 ! 78 EX + mov r4, r2 ! 5 MT (latency=0) + + cmp/eq r3, r0 ! 54 MT + add #0x1f, r2 ! 50 EX + + bt/s 1f ! 110 BR + and r1, r2 ! 78 EX + + ! copy initial words until cache line aligned + + mov.l @(r0, r5), r1 ! 21 LS (latency=2) + tst #4, r0 ! 87 MT + + mov r5, r6 ! 5 MT (latency=0) + add #-4, r6 ! 50 EX + + bt/s 4f ! 111 BR + add #8, r3 ! 50 EX + + tst #0x18, r0 ! 87 MT + + bt/s 1f ! 109 BR + mov.l r1,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) + +4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) + cmp/eq r3, r0 ! 54 MT + + mov.l r1, @-r0 ! 30 LS + bf/s 3b ! 109 BR + + mov.l r7, @-r0 ! 30 LS + + ! Copy the cache line aligned blocks + ! + ! In use: r0, r2, r4, r5 + ! Scratch: r1, r3, r6, r7 + ! + ! We could do this with the four scratch registers, but if src + ! and dest hit the same cache line, this will thrash, so make + ! use of additional registers. + ! + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) + ! this can be reversed at the end, so we don't need to save any extra + ! state. + ! +1: mov.l r8, @-r15 ! 30 LS + add r0, r5 ! 49 EX + + mov.l r9, @-r15 ! 30 LS + mov r0, r1 ! 5 MT (latency=0) + + mov.l r10, @-r15 ! 30 LS + add #-0x1c, r5 ! 50 EX + + mov.l r11, @-r15 ! 30 LS + + ! 16 cycles, 32 bytes per iteration +2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) + add #-0x20, r1 ! 50 EX + mov.l @(0x04,r5),r3 ! 18 LS (latency=2) + mov.l @(0x08,r5),r6 ! 18 LS (latency=2) + mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) + mov.l @(0x10,r5),r8 ! 18 LS (latency=2) + mov.l @(0x14,r5),r9 ! 18 LS (latency=2) + mov.l @(0x18,r5),r10 ! 18 LS (latency=2) + mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) + movca.l r0,@r1 ! 40 LS (latency=3-7) + mov.l r3,@(0x04,r1) ! 33 LS + mov.l r6,@(0x08,r1) ! 33 LS + mov.l r7,@(0x0c,r1) ! 33 LS + + mov.l r8,@(0x10,r1) ! 33 LS + add #-0x20, r5 ! 50 EX + + mov.l r9,@(0x14,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10,@(0x18,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11,@(0x1c,r1) ! 33 LS + + mov r1, r0 ! 5 MT (latency=0) + + mov.l @r15+, r11 ! 15 LS + sub r1, r5 ! 75 EX + + mov.l @r15+, r10 ! 15 LS + cmp/eq r4, r0 ! 54 MT + + bf/s 1f ! 109 BR + mov.l @r15+, r9 ! 15 LS + + rts +1: mov.l @r15+, r8 ! 15 LS + sub r4, r1 ! 75 EX (len remaining) + + ! number of trailing bytes is non-zero + ! + ! invariants restored (r5 already decremented by 4) + ! also r1=num bytes remaining + + mov #4, r2 ! 6 EX + mov r4, r7 ! 5 MT (latency=0) + + add #0x1c, r5 ! 50 EX (back to -4) + cmp/hs r2, r1 ! 58 MT + + bf/s 5f ! 108 BR + add #11, r7 ! 50 EX + + mov.l @(r0, r5), r6 ! 21 LS (latency=2) + tst r2, r1 ! 86 MT + + mov r5, r3 ! 5 MT (latency=0) + bt/s 4f ! 111 BR + + add #-4, r3 ! 50 EX + cmp/hs r2, r1 ! 58 MT + + bt/s 5f ! 111 BR + mov.l r6,@-r0 ! 30 LS + + ! 4 cycles, 2 long words per iteration +3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) + +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) + cmp/hi r7, r0 + + mov.l r6, @-r0 ! 30 LS + bt/s 3b ! 109 BR + + mov.l r2, @-r0 ! 30 LS + + ! Copy the final 0-3 bytes + +5: cmp/eq r0, r4 ! 54 MT + add #-10, r7 ! 50 EX + + bt 9f ! 110 BR + add #3,r5 ! 50 EX + + ! 3 cycles, 1 byte per iteration +1: mov.b @(r0,r5),r1 ! 19 LS + cmp/hi r7,r0 ! 57 MT + + bt/s 1b ! 111 BR + mov.b r1,@-r0 ! 28 LS + +9: rts + nop + + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + + .balign 32 +.Lcase2: + ! Size is 16 or greater and less then 64, but may have trailing bytes + +2: mov r5, r6 ! 5 MT (latency=0) + add #-2,r5 ! 50 EX + + mov r4,r2 ! 5 MT (latency=0) + add #-4,r6 ! 50 EX + + add #7,r2 ! 50 EX +3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) + + mov.w @(r0,r6),r3 ! 20 LS (latency=2) + cmp/hi r2,r0 ! 57 MT + + mov.w r1,@-r0 ! 29 LS + bt/s 3b ! 111 BR + + mov.w r3,@-r0 ! 29 LS + + bra 10f + nop + + + .balign 32 +.Lcase2b: + ! Size is at least 64 bytes, so will be going round the big loop at least once. + ! + ! r2 = rounded up r4 + ! r3 = rounded down r0 + + mov r0, r3 ! 5 MT (latency=0) + mov #(~0x1f), r1 ! 6 EX + + and r1, r3 ! 78 EX + mov r4, r2 ! 5 MT (latency=0) + + cmp/eq r3, r0 ! 54 MT + add #0x1f, r2 ! 50 EX + + add #-2, r5 ! 50 EX + bt/s 1f ! 110 BR + and r1, r2 ! 78 EX + + ! Copy a short word one at a time until we are cache line aligned + ! Normal values: r0, r2, r3, r4 + ! Unused: r1, r6, r7 + ! Mod: r5 (=r5-2) + ! + add #2, r3 ! 50 EX + +2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) + cmp/eq r3,r0 ! 54 MT + + bf/s 2b ! 111 BR + + mov.w r1,@-r0 ! 29 LS + + ! Copy the cache line aligned blocks + ! + ! In use: r0, r2, r4, r5 (=r5-2) + ! Scratch: r1, r3, r6, r7 + ! + ! We could do this with the four scratch registers, but if src + ! and dest hit the same cache line, this will thrash, so make + ! use of additional registers. + ! + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) + ! this can be reversed at the end, so we don't need to save any extra + ! state. + ! +1: mov.l r8, @-r15 ! 30 LS + add r0, r5 ! 49 EX + + mov.l r9, @-r15 ! 30 LS + mov r0, r1 ! 5 MT (latency=0) + + mov.l r10, @-r15 ! 30 LS + add #-0x1e, r5 ! 50 EX + + mov.l r11, @-r15 ! 30 LS + + mov.l r12, @-r15 ! 30 LS + + ! 17 cycles, 32 bytes per iteration +#ifdef CONFIG_CPU_LITTLE_ENDIAN +2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI + add #-0x20, r1 ! 50 EX + + mov.l @r5+, r3 ! 15 LS (latency=2) NMLK + + mov.l @r5+, r6 ! 15 LS (latency=2) RQPO + shll16 r0 ! 103 EX JI.. + + mov.l @r5+, r7 ! 15 LS (latency=2) + xtrct r3, r0 ! 48 EX LKJI + + mov.l @r5+, r8 ! 15 LS (latency=2) + xtrct r6, r3 ! 48 EX PONM + + mov.l @r5+, r9 ! 15 LS (latency=2) + xtrct r7, r6 ! 48 EX + + mov.l @r5+, r10 ! 15 LS (latency=2) + xtrct r8, r7 ! 48 EX + + mov.l @r5+, r11 ! 15 LS (latency=2) + xtrct r9, r8 ! 48 EX + + mov.w @r5+, r12 ! 15 LS (latency=2) + xtrct r10, r9 ! 48 EX + + movca.l r0,@r1 ! 40 LS (latency=3-7) + xtrct r11, r10 ! 48 EX + + mov.l r3, @(0x04,r1) ! 33 LS + xtrct r12, r11 ! 48 EX + + mov.l r6, @(0x08,r1) ! 33 LS + + mov.l r7, @(0x0c,r1) ! 33 LS + + mov.l r8, @(0x10,r1) ! 33 LS + add #-0x40, r5 ! 50 EX + + mov.l r9, @(0x14,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10, @(0x18,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11, @(0x1c,r1) ! 33 LS +#else +2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) + add #-2, r5 ! 50 EX + + mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) + add #-4, r1 ! 50 EX + + mov.l @(0x18,r5), r6 ! 18 LS (latency=2) + shll16 r0 ! 103 EX + + mov.l @(0x14,r5), r7 ! 18 LS (latency=2) + xtrct r3, r0 ! 48 EX + + mov.l @(0x10,r5), r8 ! 18 LS (latency=2) + xtrct r6, r3 ! 48 EX + + mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) + xtrct r7, r6 ! 48 EX + + mov.l @(0x08,r5), r10 ! 18 LS (latency=2) + xtrct r8, r7 ! 48 EX + + mov.l @(0x04,r5), r11 ! 18 LS (latency=2) + xtrct r9, r8 ! 48 EX + + mov.l @(0x00,r5), r12 ! 18 LS (latency=2) + xtrct r10, r9 ! 48 EX + + movca.l r0,@r1 ! 40 LS (latency=3-7) + add #-0x1c, r1 ! 50 EX + + mov.l r3, @(0x18,r1) ! 33 LS + xtrct r11, r10 ! 48 EX + + mov.l r6, @(0x14,r1) ! 33 LS + xtrct r12, r11 ! 48 EX + + mov.l r7, @(0x10,r1) ! 33 LS + + mov.l r8, @(0x0c,r1) ! 33 LS + add #-0x1e, r5 ! 50 EX + + mov.l r9, @(0x08,r1) ! 33 LS + cmp/eq r2,r1 ! 54 MT + + mov.l r10, @(0x04,r1) ! 33 LS + bf/s 2b ! 109 BR + + mov.l r11, @(0x00,r1) ! 33 LS +#endif + + mov.l @r15+, r12 + mov r1, r0 ! 5 MT (latency=0) + + mov.l @r15+, r11 ! 15 LS + sub r1, r5 ! 75 EX + + mov.l @r15+, r10 ! 15 LS + cmp/eq r4, r0 ! 54 MT + + bf/s 1f ! 109 BR + mov.l @r15+, r9 ! 15 LS + + rts +1: mov.l @r15+, r8 ! 15 LS + + add #0x1e, r5 ! 50 EX + + ! Finish off a short word at a time + ! r5 must be invariant - 2 +10: mov r4,r2 ! 5 MT (latency=0) + add #1,r2 ! 50 EX + + cmp/hi r2, r0 ! 57 MT + bf/s 1f ! 109 BR + + add #2, r2 ! 50 EX + +3: mov.w @(r0,r5),r1 ! 20 LS + cmp/hi r2,r0 ! 57 MT + + bt/s 3b ! 109 BR + + mov.w r1,@-r0 ! 29 LS +1: + + ! + ! Finally, copy the last byte if necessary + cmp/eq r4,r0 ! 54 MT + bt/s 9b + add #1,r5 + mov.b @(r0,r5),r1 + rts + mov.b r1,@-r0 + diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S new file mode 100644 index 000000000..08ab3062c --- /dev/null +++ b/arch/sh/lib/memcpy.S @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $ + * + * "memcpy" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memcpy(void *dst, const void *src, size_t n); + * No overlap between the memory of DST and of SRC are assumed. + */ + +#include <linux/linkage.h> +ENTRY(memcpy) + tst r6,r6 + bt/s 9f ! if n=0, do nothing + mov r4,r0 + sub r4,r5 ! From here, r5 has the distance to r0 + add r6,r0 ! From here, r0 points the end of copying point + mov #12,r1 + cmp/gt r6,r1 + bt/s 7f ! if it's too small, copy a byte at once + add #-1,r5 + add #1,r5 + ! From here, r6 is free + ! + ! r4 --> [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0 --> [ ... ] r0+r5 --> [ ... ] + ! + ! + mov r5,r1 + mov #3,r2 + and r2,r1 + shll2 r1 + mov r0,r3 ! Save the value on R0 to R3 + mova jmptable,r0 + add r1,r0 + mov.l @r0,r1 + jmp @r1 + mov r3,r0 ! and back to R0 + .balign 4 +jmptable: + .long case0 + .long case1 + .long case2 + .long case3 + + ! copy a byte at once +7: mov r4,r2 + add #1,r2 +8: + cmp/hi r2,r0 + mov.b @(r0,r5),r1 + bt/s 8b ! while (r0>r2) + mov.b r1,@-r0 +9: + rts + nop + +case0: + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-4,r5 + add #3,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! + add #-3,r5 +2: ! Second, copy a long word at once + mov r4,r2 + add #7,r2 +3: mov.l @(r0,r5),r1 + cmp/hi r2,r0 + bt/s 3b + mov.l r1,@-r0 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #3,r5 + bra 8b + add #-6,r2 + +case1: + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! +2: ! Second, read a long word and write a long word at once + mov.l @(r0,r5),r1 + add #-4,r5 + mov r4,r2 + add #7,r2 + ! +#ifdef __LITTLE_ENDIAN__ +3: mov r1,r3 ! RQPO + shll16 r3 + shll8 r3 ! Oxxx + mov.l @(r0,r5),r1 ! NMLK + mov r1,r6 + shlr8 r6 ! xNML + or r6,r3 ! ONML + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#else +3: mov r1,r3 ! OPQR + shlr16 r3 + shlr8 r3 ! xxxO + mov.l @(r0,r5),r1 ! KLMN + mov r1,r6 + shll8 r6 ! LMNx + or r6,r3 ! LMNO + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #4,r5 + bra 8b + add #-6,r2 + +case2: + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + ! First, align to word boundary + tst #1,r0 + bt/s 2f + add #-1,r5 + mov.b @(r0,r5),r1 + mov.b r1,@-r0 + ! +2: ! Second, read a word and write a word at once + add #-1,r5 + mov r4,r2 + add #3,r2 + ! +3: mov.w @(r0,r5),r1 + cmp/hi r2,r0 + bt/s 3b + mov.w r1,@-r0 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #1,r5 + mov.b @(r0,r5),r1 + rts + mov.b r1,@-r0 + +case3: + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r5 +1: dt r3 + mov.b @(r0,r5),r1 + bf/s 1b + mov.b r1,@-r0 + ! +2: ! Second, read a long word and write a long word at once + add #-2,r5 + mov.l @(r0,r5),r1 + add #-4,r5 + mov r4,r2 + add #7,r2 + ! +#ifdef __LITTLE_ENDIAN__ +3: mov r1,r3 ! RQPO + shll8 r3 ! QPOx + mov.l @(r0,r5),r1 ! NMLK + mov r1,r6 + shlr16 r6 + shlr8 r6 ! xxxN + or r6,r3 ! QPON + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#else +3: mov r1,r3 ! OPQR + shlr8 r3 ! xOPQ + mov.l @(r0,r5),r1 ! KLMN + mov r1,r6 + shll16 r6 + shll8 r6 ! Nxxx + or r6,r3 ! NOPQ + cmp/hi r2,r0 + bt/s 3b + mov.l r3,@-r0 +#endif + ! + ! Third, copy a byte at once, if necessary + cmp/eq r4,r0 + bt/s 9b + add #6,r5 + bra 8b + add #-6,r2 diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S new file mode 100644 index 000000000..bdca32181 --- /dev/null +++ b/arch/sh/lib/memmove.S @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $ + * + * "memmove" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memmove(void *dst, const void *src, size_t n); + * The memory areas may overlap. + */ + +#include <linux/linkage.h> +ENTRY(memmove) + ! if dest > src, call memcpy (it copies in decreasing order) + cmp/hi r5,r4 + bf 1f + mov.l 2f,r0 + jmp @r0 + nop + .balign 4 +2: .long memcpy +1: + sub r5,r4 ! From here, r4 has the distance to r0 + tst r6,r6 + bt/s 9f ! if n=0, do nothing + mov r5,r0 + add r6,r5 + mov #12,r1 + cmp/gt r6,r1 + bt/s 8f ! if it's too small, copy a byte at once + add #-1,r4 + add #1,r4 + ! + ! [ ... ] DST [ ... ] SRC + ! [ ... ] [ ... ] + ! : : + ! r0+r4--> [ ... ] r0 --> [ ... ] + ! : : + ! [ ... ] [ ... ] + ! r5 --> + ! + mov r4,r1 + mov #3,r2 + and r2,r1 + shll2 r1 + mov r0,r3 ! Save the value on R0 to R3 + mova jmptable,r0 + add r1,r0 + mov.l @r0,r1 + jmp @r1 + mov r3,r0 ! and back to R0 + .balign 4 +jmptable: + .long case0 + .long case1 + .long case2 + .long case3 + + ! copy a byte at once +8: mov.b @r0+,r1 + cmp/hs r5,r0 + bf/s 8b ! while (r0<r5) + mov.b r1,@(r0,r4) + add #1,r4 +9: + add r4,r0 + rts + sub r6,r0 + +case_none: + bra 8b + add #-1,r4 + +case0: + ! + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, copy a long word at once + add #-3,r4 + add #-3,r5 +3: mov.l @r0+,r1 + cmp/hs r5,r0 + bf/s 3b + mov.l r1,@(r0,r4) + add #3,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #4,r4 + bra 8b + add #-1,r4 + +case3: + ! + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, read a long word and write a long word at once + add #-2,r4 + mov.l @(r0,r4),r1 + add #-7,r5 + add #-4,r4 + ! +#ifdef __LITTLE_ENDIAN__ + shll8 r1 +3: mov r1,r3 ! JIHG + shlr8 r3 ! xJIH + mov.l @r0+,r1 ! NMLK + mov r1,r2 + shll16 r2 + shll8 r2 ! Kxxx + or r2,r3 ! KJIH + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#else + shlr8 r1 +3: mov r1,r3 ! GHIJ + shll8 r3 ! HIJx + mov.l @r0+,r1 ! KLMN + mov r1,r2 + shlr16 r2 + shlr8 r2 ! xxxK + or r2,r3 ! HIJK + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#endif + add #7,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #7,r4 + add #-3,r0 + bra 8b + add #-1,r4 + +case2: + ! + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. + ! + ! First, align to word boundary + tst #1,r0 + bt/s 2f + add #-1,r4 + mov.b @r0+,r1 + mov.b r1,@(r0,r4) + ! +2: ! Second, read a word and write a word at once + add #-1,r4 + add #-1,r5 + ! +3: mov.w @r0+,r1 + cmp/hs r5,r0 + bf/s 3b + mov.w r1,@(r0,r4) + add #1,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #2,r4 + mov.b @r0,r1 + mov.b r1,@(r0,r4) + bra 9b + add #1,r0 + +case1: + ! + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... + ! + ! First, align to long word boundary + mov r0,r3 + and r2,r3 + tst r3,r3 + bt/s 2f + add #-1,r4 + mov #4,r2 + sub r3,r2 +1: dt r2 + mov.b @r0+,r1 + bf/s 1b + mov.b r1,@(r0,r4) + ! +2: ! Second, read a long word and write a long word at once + mov.l @(r0,r4),r1 + add #-7,r5 + add #-4,r4 + ! +#ifdef __LITTLE_ENDIAN__ + shll16 r1 + shll8 r1 +3: mov r1,r3 ! JIHG + shlr16 r3 + shlr8 r3 ! xxxJ + mov.l @r0+,r1 ! NMLK + mov r1,r2 + shll8 r2 ! MLKx + or r2,r3 ! MLKJ + cmp/hs r5,r0 + bf/s 3b + mov.l r3,@(r0,r4) +#else + shlr16 r1 + shlr8 r1 +3: mov r1,r3 ! GHIJ + shll16 r3 + shll8 r3 ! Jxxx + mov.l @r0+,r1 ! KLMN + mov r1,r2 + shlr8 r2 ! xKLM + or r2,r3 ! JKLM + cmp/hs r5,r0 + bf/s 3b ! while(r0<r5) + mov.l r3,@(r0,r4) +#endif + add #7,r5 + ! + ! Third, copy a byte at once, if necessary + cmp/eq r5,r0 + bt/s 9b + add #5,r4 + add #-3,r0 + bra 8b + add #-1,r4 diff --git a/arch/sh/lib/memset-sh4.S b/arch/sh/lib/memset-sh4.S new file mode 100644 index 000000000..10649c4cd --- /dev/null +++ b/arch/sh/lib/memset-sh4.S @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * "memset" implementation for SH4 + * + * Copyright (C) 1999 Niibe Yutaka + * Copyright (c) 2009 STMicroelectronics Limited + * Author: Stuart Menefy <stuart.menefy:st.com> + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include <linux/linkage.h> + +ENTRY(memset) + mov #12,r0 + add r6,r4 + cmp/gt r6,r0 + bt/s 40f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + extu.b r5,r5 + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + + ! Check if enough bytes need to be copied to be worth the big loop + mov #0x40, r0 ! (MT) + cmp/gt r6,r0 ! (MT) 64 > len => slow loop + + bt/s 22f + mov r6,r0 + + ! align the dst to the cache block size if necessary + mov r4, r3 + mov #~(0x1f), r1 + + and r3, r1 + cmp/eq r3, r1 + + bt/s 11f ! dst is already aligned + sub r1, r3 ! r3-r1 -> r3 + shlr2 r3 ! number of loops + +10: mov.l r5,@-r4 + dt r3 + bf/s 10b + add #-4, r6 + +11: ! dst is 32byte aligned + mov r6,r2 + mov #-5,r0 + shld r0,r2 ! number of loops + + add #-32, r4 + mov r5, r0 +12: + movca.l r0,@r4 + mov.l r5,@(4, r4) + mov.l r5,@(8, r4) + mov.l r5,@(12,r4) + mov.l r5,@(16,r4) + mov.l r5,@(20,r4) + add #-0x20, r6 + mov.l r5,@(24,r4) + dt r2 + mov.l r5,@(28,r4) + bf/s 12b + add #-32, r4 + + add #32, r4 + mov #8, r0 + cmp/ge r0, r6 + bf 40f + + mov r6,r0 +22: + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + + ! fill bytes (length may be zero) +40: tst r6,r6 + bt 5f +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S new file mode 100644 index 000000000..a6d5ec0bd --- /dev/null +++ b/arch/sh/lib/memset.S @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memset" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include <linux/linkage.h> + +ENTRY(memset) + tst r6,r6 + bt/s 5f ! if n=0, do nothing + add r6,r4 + mov #12,r0 + cmp/gt r6,r0 + bt/s 4f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + extu.b r5,r5 + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + ! + mov r6,r0 + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + tst r6,r6 + bt 5f + ! fill bytes +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 diff --git a/arch/sh/lib/movmem.S b/arch/sh/lib/movmem.S new file mode 100644 index 000000000..8ac54d6b3 --- /dev/null +++ b/arch/sh/lib/movmem.S @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + + .text + .balign 4 + .global __movmem + .global __movstr + .set __movstr, __movmem + /* This would be a lot simpler if r6 contained the byte count + minus 64, and we wouldn't be called here for a byte count of 64. */ +__movmem: + sts.l pr,@-r15 + shll2 r6 + bsr __movmemSI52+2 + mov.l @(48,r5),r0 + .balign 4 +movmem_loop: /* Reached with rts */ + mov.l @(60,r5),r0 + add #-64,r6 + mov.l r0,@(60,r4) + tst r6,r6 + mov.l @(56,r5),r0 + bt movmem_done + mov.l r0,@(56,r4) + cmp/pl r6 + mov.l @(52,r5),r0 + add #64,r5 + mov.l r0,@(52,r4) + add #64,r4 + bt __movmemSI52 +! done all the large groups, do the remainder +! jump to movmem+ + mova __movmemSI4+4,r0 + add r6,r0 + jmp @r0 +movmem_done: ! share slot insn, works out aligned. + lds.l @r15+,pr + mov.l r0,@(56,r4) + mov.l @(52,r5),r0 + rts + mov.l r0,@(52,r4) + .balign 4 + + .global __movmemSI64 + .global __movstrSI64 + .set __movstrSI64, __movmemSI64 +__movmemSI64: + mov.l @(60,r5),r0 + mov.l r0,@(60,r4) + .global __movmemSI60 + .global __movstrSI60 + .set __movstrSI60, __movmemSI60 +__movmemSI60: + mov.l @(56,r5),r0 + mov.l r0,@(56,r4) + .global __movmemSI56 + .global __movstrSI56 + .set __movstrSI56, __movmemSI56 +__movmemSI56: + mov.l @(52,r5),r0 + mov.l r0,@(52,r4) + .global __movmemSI52 + .global __movstrSI52 + .set __movstrSI52, __movmemSI52 +__movmemSI52: + mov.l @(48,r5),r0 + mov.l r0,@(48,r4) + .global __movmemSI48 + .global __movstrSI48 + .set __movstrSI48, __movmemSI48 +__movmemSI48: + mov.l @(44,r5),r0 + mov.l r0,@(44,r4) + .global __movmemSI44 + .global __movstrSI44 + .set __movstrSI44, __movmemSI44 +__movmemSI44: + mov.l @(40,r5),r0 + mov.l r0,@(40,r4) + .global __movmemSI40 + .global __movstrSI40 + .set __movstrSI40, __movmemSI40 +__movmemSI40: + mov.l @(36,r5),r0 + mov.l r0,@(36,r4) + .global __movmemSI36 + .global __movstrSI36 + .set __movstrSI36, __movmemSI36 +__movmemSI36: + mov.l @(32,r5),r0 + mov.l r0,@(32,r4) + .global __movmemSI32 + .global __movstrSI32 + .set __movstrSI32, __movmemSI32 +__movmemSI32: + mov.l @(28,r5),r0 + mov.l r0,@(28,r4) + .global __movmemSI28 + .global __movstrSI28 + .set __movstrSI28, __movmemSI28 +__movmemSI28: + mov.l @(24,r5),r0 + mov.l r0,@(24,r4) + .global __movmemSI24 + .global __movstrSI24 + .set __movstrSI24, __movmemSI24 +__movmemSI24: + mov.l @(20,r5),r0 + mov.l r0,@(20,r4) + .global __movmemSI20 + .global __movstrSI20 + .set __movstrSI20, __movmemSI20 +__movmemSI20: + mov.l @(16,r5),r0 + mov.l r0,@(16,r4) + .global __movmemSI16 + .global __movstrSI16 + .set __movstrSI16, __movmemSI16 +__movmemSI16: + mov.l @(12,r5),r0 + mov.l r0,@(12,r4) + .global __movmemSI12 + .global __movstrSI12 + .set __movstrSI12, __movmemSI12 +__movmemSI12: + mov.l @(8,r5),r0 + mov.l r0,@(8,r4) + .global __movmemSI8 + .global __movstrSI8 + .set __movstrSI8, __movmemSI8 +__movmemSI8: + mov.l @(4,r5),r0 + mov.l r0,@(4,r4) + .global __movmemSI4 + .global __movstrSI4 + .set __movstrSI4, __movmemSI4 +__movmemSI4: + mov.l @(0,r5),r0 + rts + mov.l r0,@(0,r4) + + .global __movmem_i4_even + .global __movstr_i4_even + .set __movstr_i4_even, __movmem_i4_even + + .global __movmem_i4_odd + .global __movstr_i4_odd + .set __movstr_i4_odd, __movmem_i4_odd + + .global __movmemSI12_i4 + .global __movstrSI12_i4 + .set __movstrSI12_i4, __movmemSI12_i4 + + .p2align 5 +L_movmem_2mod4_end: + mov.l r0,@(16,r4) + rts + mov.l r1,@(20,r4) + + .p2align 2 + +__movmem_i4_even: + mov.l @r5+,r0 + bra L_movmem_start_even + mov.l @r5+,r1 + +__movmem_i4_odd: + mov.l @r5+,r1 + add #-4,r4 + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r1,@(4,r4) + mov.l r2,@(8,r4) + +L_movmem_loop: + mov.l r3,@(12,r4) + dt r6 + mov.l @r5+,r0 + bt/s L_movmem_2mod4_end + mov.l @r5+,r1 + add #16,r4 +L_movmem_start_even: + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r0,@r4 + dt r6 + mov.l r1,@(4,r4) + bf/s L_movmem_loop + mov.l r2,@(8,r4) + rts + mov.l r3,@(12,r4) + + .p2align 4 +__movmemSI12_i4: + mov.l @r5,r0 + mov.l @(4,r5),r1 + mov.l @(8,r5),r2 + mov.l r0,@r4 + mov.l r1,@(4,r4) + rts + mov.l r2,@(8,r4) diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S new file mode 100644 index 000000000..80ea53dd3 --- /dev/null +++ b/arch/sh/lib/strlen.S @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $ + * + * "strlen" implementation of SuperH + * + * Copyright (C) 1999 Kaz Kojima + * + */ + +/* size_t strlen (const char *s) */ + +#include <linux/linkage.h> +ENTRY(strlen) + mov r4,r0 + and #3,r0 + tst r0,r0 + bt/s 1f + mov #0,r2 + + add #-1,r0 + shll2 r0 + shll r0 + braf r0 + nop + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + +1: + mov #0,r3 +2: + mov.l @r4+,r1 + cmp/str r3,r1 + bf/s 2b + add #4,r2 + + add #-4,r2 +#ifndef __LITTLE_ENDIAN__ + swap.b r1,r1 + swap.w r1,r1 + swap.b r1,r1 +#endif + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt 8f + add #1,r2 +8: + rts + mov r2,r0 diff --git a/arch/sh/lib/udiv_qrnnd.S b/arch/sh/lib/udiv_qrnnd.S new file mode 100644 index 000000000..28938dacc --- /dev/null +++ b/arch/sh/lib/udiv_qrnnd.S @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + + /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */ + /* n1 < d, but n1 might be larger than d1. */ + .global __udiv_qrnnd_16 + .balign 8 +__udiv_qrnnd_16: + div0u + cmp/hi r6,r0 + bt .Lots + .rept 16 + div1 r6,r0 + .endr + extu.w r0,r1 + bt 0f + add r6,r0 +0: rotcl r1 + mulu.w r1,r5 + xtrct r4,r0 + swap.w r0,r0 + sts macl,r2 + cmp/hs r2,r0 + sub r2,r0 + bt 0f + addc r5,r0 + add #-1,r1 + bt 0f +1: add #-1,r1 + rts + add r5,r0 + .balign 8 +.Lots: + sub r5,r0 + swap.w r4,r1 + xtrct r0,r1 + clrt + mov r1,r0 + addc r5,r0 + mov #-1,r1 + bf/s 1b + shlr16 r1 +0: rts + nop diff --git a/arch/sh/lib/udivsi3.S b/arch/sh/lib/udivsi3.S new file mode 100644 index 000000000..09ed1f9de --- /dev/null +++ b/arch/sh/lib/udivsi3.S @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + + .balign 4 + .global __udivsi3 + .type __udivsi3, @function +div8: + div1 r5,r4 +div7: + div1 r5,r4; div1 r5,r4; div1 r5,r4 + div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 + +divx4: + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + rts; div1 r5,r4 + +__udivsi3: + sts.l pr,@-r15 + extu.w r5,r0 + cmp/eq r5,r0 + bf/s large_divisor + div0u + swap.w r4,r0 + shlr16 r4 + bsr div8 + shll16 r5 + bsr div7 + div1 r5,r4 + xtrct r4,r0 + xtrct r0,r4 + bsr div8 + swap.w r4,r4 + bsr div7 + div1 r5,r4 + lds.l @r15+,pr + xtrct r4,r0 + swap.w r0,r0 + rotcl r0 + rts + shlr16 r5 + +large_divisor: + mov #0,r0 + xtrct r4,r0 + xtrct r0,r4 + bsr divx4 + rotcl r0 + bsr divx4 + rotcl r0 + bsr divx4 + rotcl r0 + bsr divx4 + rotcl r0 + lds.l @r15+,pr + rts + rotcl r0 diff --git a/arch/sh/lib/udivsi3_i4i-Os.S b/arch/sh/lib/udivsi3_i4i-Os.S new file mode 100644 index 000000000..fa4e4dff3 --- /dev/null +++ b/arch/sh/lib/udivsi3_i4i-Os.S @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + * + * Copyright (C) 2006 Free Software Foundation, Inc. + */ + +/* Moderately Space-optimized libgcc routines for the Renesas SH / + STMicroelectronics ST40 CPUs. + Contributed by J"orn Rennecke joern.rennecke@st.com. */ + +/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i + sh4-200 run times: + udiv small divisor: 55 cycles + udiv large divisor: 52 cycles + sdiv small divisor, positive result: 59 cycles + sdiv large divisor, positive result: 56 cycles + sdiv small divisor, negative result: 65 cycles (*) + sdiv large divisor, negative result: 62 cycles (*) + (*): r2 is restored in the rts delay slot and has a lingering latency + of two more cycles. */ + .balign 4 + .global __udivsi3_i4i + .global __udivsi3_i4 + .set __udivsi3_i4, __udivsi3_i4i + .type __udivsi3_i4i, @function + .type __sdivsi3_i4i, @function +__udivsi3_i4i: + sts pr,r1 + mov.l r4,@-r15 + extu.w r5,r0 + cmp/eq r5,r0 + swap.w r4,r0 + shlr16 r4 + bf/s large_divisor + div0u + mov.l r5,@-r15 + shll16 r5 +sdiv_small_divisor: + div1 r5,r4 + bsr div6 + div1 r5,r4 + div1 r5,r4 + bsr div6 + div1 r5,r4 + xtrct r4,r0 + xtrct r0,r4 + bsr div7 + swap.w r4,r4 + div1 r5,r4 + bsr div7 + div1 r5,r4 + xtrct r4,r0 + mov.l @r15+,r5 + swap.w r0,r0 + mov.l @r15+,r4 + jmp @r1 + rotcl r0 +div7: + div1 r5,r4 +div6: + div1 r5,r4; div1 r5,r4; div1 r5,r4 + div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 + +divx3: + rotcl r0 + div1 r5,r4 + rotcl r0 + div1 r5,r4 + rotcl r0 + rts + div1 r5,r4 + +large_divisor: + mov.l r5,@-r15 +sdiv_large_divisor: + xor r4,r0 + .rept 4 + rotcl r0 + bsr divx3 + div1 r5,r4 + .endr + mov.l @r15+,r5 + mov.l @r15+,r4 + jmp @r1 + rotcl r0 + + .global __sdivsi3_i4i + .global __sdivsi3_i4 + .global __sdivsi3 + .set __sdivsi3_i4, __sdivsi3_i4i + .set __sdivsi3, __sdivsi3_i4i +__sdivsi3_i4i: + mov.l r4,@-r15 + cmp/pz r5 + mov.l r5,@-r15 + bt/s pos_divisor + cmp/pz r4 + neg r5,r5 + extu.w r5,r0 + bt/s neg_result + cmp/eq r5,r0 + neg r4,r4 +pos_result: + swap.w r4,r0 + bra sdiv_check_divisor + sts pr,r1 +pos_divisor: + extu.w r5,r0 + bt/s pos_result + cmp/eq r5,r0 + neg r4,r4 +neg_result: + mova negate_result,r0 + ; + mov r0,r1 + swap.w r4,r0 + lds r2,macl + sts pr,r2 +sdiv_check_divisor: + shlr16 r4 + bf/s sdiv_large_divisor + div0u + bra sdiv_small_divisor + shll16 r5 + .balign 4 +negate_result: + neg r0,r0 + jmp @r2 + sts macl,r2 diff --git a/arch/sh/lib/udivsi3_i4i.S b/arch/sh/lib/udivsi3_i4i.S new file mode 100644 index 000000000..6944eb6b4 --- /dev/null +++ b/arch/sh/lib/udivsi3_i4i.S @@ -0,0 +1,645 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH GCC-exception-2.0 + + Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006 + Free Software Foundation, Inc. +*/ + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +/* This code used shld, thus is not suitable for SH1 / SH2. */ + +/* Signed / unsigned division without use of FPU, optimized for SH4. + Uses a lookup table for divisors in the range -128 .. +128, and + div1 with case distinction for larger divisors in three more ranges. + The code is lumped together with the table to allow the use of mova. */ +#ifdef CONFIG_CPU_LITTLE_ENDIAN +#define L_LSB 0 +#define L_LSWMSB 1 +#define L_MSWLSB 2 +#else +#define L_LSB 3 +#define L_LSWMSB 2 +#define L_MSWLSB 1 +#endif + + .balign 4 + .global __udivsi3_i4i + .global __udivsi3_i4 + .set __udivsi3_i4, __udivsi3_i4i + .type __udivsi3_i4i, @function +__udivsi3_i4i: + mov.w c128_w, r1 + div0u + mov r4,r0 + shlr8 r0 + cmp/hi r1,r5 + extu.w r5,r1 + bf udiv_le128 + cmp/eq r5,r1 + bf udiv_ge64k + shlr r0 + mov r5,r1 + shll16 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 + div1 r5,r0 + div1 r5,r0 + bra udiv_25 + div1 r5,r0 + +div_le128: + mova div_table_ix,r0 + bra div_le128_2 + mov.b @(r0,r5),r1 +udiv_le128: + mov.l r4,@-r15 + mova div_table_ix,r0 + mov.b @(r0,r5),r1 + mov.l r5,@-r15 +div_le128_2: + mova div_table_inv,r0 + mov.l @(r0,r1),r1 + mov r5,r0 + tst #0xfe,r0 + mova div_table_clz,r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + bt/s div_by_1 + mov r4,r0 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + rts + shld r1,r0 + +div_by_1_neg: + neg r4,r0 +div_by_1: + mov.l @r15+,r5 + rts + mov.l @r15+,r4 + +div_ge64k: + bt/s div_r8 + div0u + shll8 r5 + bra div_ge64k_2 + div1 r5,r0 +udiv_ge64k: + cmp/hi r0,r5 + mov r5,r1 + bt udiv_r8 + shll8 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 +div_ge64k_2: + div1 r5,r0 + mov.l zero_l,r1 + .rept 4 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w m256_w,r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra div_ge64k_end + xor r4,r0 + +div_r8: + shll16 r4 + bra div_r8_2 + shll8 r4 +udiv_r8: + mov.l r4,@-r15 + shll16 r4 + clrt + shll8 r4 + mov.l r5,@-r15 +div_r8_2: + rotcl r4 + mov r0,r1 + div1 r5,r1 + mov r4,r0 + rotcl r0 + mov r5,r4 + div1 r5,r1 + .rept 5 + rotcl r0; div1 r5,r1 + .endr + rotcl r0 + mov.l @r15+,r5 + div1 r4,r1 + mov.l @r15+,r4 + rts + rotcl r0 + + .global __sdivsi3_i4i + .global __sdivsi3_i4 + .global __sdivsi3 + .set __sdivsi3_i4, __sdivsi3_i4i + .set __sdivsi3, __sdivsi3_i4i + .type __sdivsi3_i4i, @function + /* This is link-compatible with a __sdivsi3 call, + but we effectively clobber only r1. */ +__sdivsi3_i4i: + mov.l r4,@-r15 + cmp/pz r5 + mov.w c128_w, r1 + bt/s pos_divisor + cmp/pz r4 + mov.l r5,@-r15 + neg r5,r5 + bt/s neg_result + cmp/hi r1,r5 + neg r4,r4 +pos_result: + extu.w r5,r0 + bf div_le128 + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s div_ge64k + cmp/hi r0,r5 + div0u + shll16 r5 + div1 r5,r0 + div1 r5,r0 + div1 r5,r0 +udiv_25: + mov.l zero_l,r1 + div1 r5,r0 + div1 r5,r0 + mov.l r1,@-r15 + .rept 3 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +div_ge64k_end: + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r0 + mov.l @r15+,r5 + or r4,r0 + mov.l @r15+,r4 + rts + rotcl r0 + +div_le128_neg: + tst #0xfe,r0 + mova div_table_ix,r0 + mov.b @(r0,r5),r1 + mova div_table_inv,r0 + bt/s div_by_1_neg + mov.l @(r0,r1),r1 + mova div_table_clz,r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + shld r1,r0 + rts + neg r0,r0 + +pos_divisor: + mov.l r5,@-r15 + bt/s pos_result + cmp/hi r1,r5 + neg r4,r4 +neg_result: + extu.w r5,r0 + bf div_le128_neg + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s div_ge64k_neg + cmp/hi r0,r5 + div0u + mov.l zero_l,r1 + shll16 r5 + div1 r5,r0 + mov.l r1,@-r15 + .rept 7 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +div_ge64k_neg_end: + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r1 + mov.l @r15+,r5 + or r4,r1 +div_r8_neg_end: + mov.l @r15+,r4 + rotcl r1 + rts + neg r1,r0 + +div_ge64k_neg: + bt/s div_r8_neg + div0u + shll8 r5 + mov.l zero_l,r1 + .rept 6 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w m256_w,r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra div_ge64k_neg_end + xor r4,r0 + +c128_w: + .word 128 + +div_r8_neg: + clrt + shll16 r4 + mov r4,r1 + shll8 r1 + mov r5,r4 + .rept 7 + rotcl r1; div1 r5,r0 + .endr + mov.l @r15+,r5 + rotcl r1 + bra div_r8_neg_end + div1 r4,r0 + +m256_w: + .word 0xff00 +/* This table has been generated by divtab-sh4.c. */ + .balign 4 +div_table_clz: + .byte 0 + .byte 1 + .byte 0 + .byte -1 + .byte -1 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 +/* Lookup table translating positive divisor to index into table of + normalized inverse. N.B. the '0' entry is also the last entry of the + previous table, and causes an unaligned access for division by zero. */ +div_table_ix: + .byte -6 + .byte -128 + .byte -128 + .byte 0 + .byte -128 + .byte -64 + .byte 0 + .byte 64 + .byte -128 + .byte -96 + .byte -64 + .byte -32 + .byte 0 + .byte 32 + .byte 64 + .byte 96 + .byte -128 + .byte -112 + .byte -96 + .byte -80 + .byte -64 + .byte -48 + .byte -32 + .byte -16 + .byte 0 + .byte 16 + .byte 32 + .byte 48 + .byte 64 + .byte 80 + .byte 96 + .byte 112 + .byte -128 + .byte -120 + .byte -112 + .byte -104 + .byte -96 + .byte -88 + .byte -80 + .byte -72 + .byte -64 + .byte -56 + .byte -48 + .byte -40 + .byte -32 + .byte -24 + .byte -16 + .byte -8 + .byte 0 + .byte 8 + .byte 16 + .byte 24 + .byte 32 + .byte 40 + .byte 48 + .byte 56 + .byte 64 + .byte 72 + .byte 80 + .byte 88 + .byte 96 + .byte 104 + .byte 112 + .byte 120 + .byte -128 + .byte -124 + .byte -120 + .byte -116 + .byte -112 + .byte -108 + .byte -104 + .byte -100 + .byte -96 + .byte -92 + .byte -88 + .byte -84 + .byte -80 + .byte -76 + .byte -72 + .byte -68 + .byte -64 + .byte -60 + .byte -56 + .byte -52 + .byte -48 + .byte -44 + .byte -40 + .byte -36 + .byte -32 + .byte -28 + .byte -24 + .byte -20 + .byte -16 + .byte -12 + .byte -8 + .byte -4 + .byte 0 + .byte 4 + .byte 8 + .byte 12 + .byte 16 + .byte 20 + .byte 24 + .byte 28 + .byte 32 + .byte 36 + .byte 40 + .byte 44 + .byte 48 + .byte 52 + .byte 56 + .byte 60 + .byte 64 + .byte 68 + .byte 72 + .byte 76 + .byte 80 + .byte 84 + .byte 88 + .byte 92 + .byte 96 + .byte 100 + .byte 104 + .byte 108 + .byte 112 + .byte 116 + .byte 120 + .byte 124 + .byte -128 +/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */ + .balign 4 +zero_l: + .long 0x0 + .long 0xF81F81F9 + .long 0xF07C1F08 + .long 0xE9131AC0 + .long 0xE1E1E1E2 + .long 0xDAE6076C + .long 0xD41D41D5 + .long 0xCD856891 + .long 0xC71C71C8 + .long 0xC0E07039 + .long 0xBACF914D + .long 0xB4E81B4F + .long 0xAF286BCB + .long 0xA98EF607 + .long 0xA41A41A5 + .long 0x9EC8E952 + .long 0x9999999A + .long 0x948B0FCE + .long 0x8F9C18FA + .long 0x8ACB90F7 + .long 0x86186187 + .long 0x81818182 + .long 0x7D05F418 + .long 0x78A4C818 + .long 0x745D1746 + .long 0x702E05C1 + .long 0x6C16C16D + .long 0x68168169 + .long 0x642C8591 + .long 0x60581606 + .long 0x5C9882BA + .long 0x58ED2309 +div_table_inv: + .long 0x55555556 + .long 0x51D07EAF + .long 0x4E5E0A73 + .long 0x4AFD6A06 + .long 0x47AE147B + .long 0x446F8657 + .long 0x41414142 + .long 0x3E22CBCF + .long 0x3B13B13C + .long 0x38138139 + .long 0x3521CFB3 + .long 0x323E34A3 + .long 0x2F684BDB + .long 0x2C9FB4D9 + .long 0x29E4129F + .long 0x27350B89 + .long 0x24924925 + .long 0x21FB7813 + .long 0x1F7047DD + .long 0x1CF06ADB + .long 0x1A7B9612 + .long 0x18118119 + .long 0x15B1E5F8 + .long 0x135C8114 + .long 0x11111112 + .long 0xECF56BF + .long 0xC9714FC + .long 0xA6810A7 + .long 0x8421085 + .long 0x624DD30 + .long 0x4104105 + .long 0x2040811 + /* maximum error: 0.987342 scaled: 0.921875*/ |