Diffstat:
-rw-r--r--  arch/microblaze/lib/Makefile        24
-rw-r--r--  arch/microblaze/lib/ashldi3.c       29
-rw-r--r--  arch/microblaze/lib/ashrdi3.c       31
-rw-r--r--  arch/microblaze/lib/cmpdi2.c        27
-rw-r--r--  arch/microblaze/lib/divsi3.S        74
-rw-r--r--  arch/microblaze/lib/fastcopy.S     666
-rw-r--r--  arch/microblaze/lib/libgcc.h        33
-rw-r--r--  arch/microblaze/lib/lshrdi3.c       29
-rw-r--r--  arch/microblaze/lib/memcpy.c       179
-rw-r--r--  arch/microblaze/lib/memmove.c      195
-rw-r--r--  arch/microblaze/lib/memset.c        94
-rw-r--r--  arch/microblaze/lib/modsi3.S        74
-rw-r--r--  arch/microblaze/lib/muldi3.c        58
-rw-r--r--  arch/microblaze/lib/mulsi3.S        47
-rw-r--r--  arch/microblaze/lib/uaccess_old.S  176
-rw-r--r--  arch/microblaze/lib/ucmpdi2.c       21
-rw-r--r--  arch/microblaze/lib/udivsi3.S       85
-rw-r--r--  arch/microblaze/lib/umodsi3.S       87
18 files changed, 1929 insertions(+), 0 deletions(-)
diff --git a/arch/microblaze/lib/Makefile b/arch/microblaze/lib/Makefile
new file mode 100644
index 000000000..9fe7ab688
--- /dev/null
+++ b/arch/microblaze/lib/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile
+#
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_ashldi3.o = -pg
+CFLAGS_REMOVE_ashrdi3.o = -pg
+CFLAGS_REMOVE_lshrdi3.o = -pg
+endif
+
+lib-y := memset.o
+
+ifeq ($(CONFIG_OPT_LIB_ASM),y)
+lib-y += fastcopy.o
+else
+lib-y += memcpy.o memmove.o
+endif
+
+lib-y += uaccess_old.o
+
+# libgcc-style stuff needed in the kernel
+obj-y += ashldi3.o ashrdi3.o cmpdi2.o divsi3.o lshrdi3.o modsi3.o
+obj-y += muldi3.o mulsi3.o ucmpdi2.o udivsi3.o umodsi3.o
diff --git a/arch/microblaze/lib/ashldi3.c b/arch/microblaze/lib/ashldi3.c
new file mode 100644
index 000000000..4d0f9481b
--- /dev/null
+++ b/arch/microblaze/lib/ashldi3.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+long long __ashldi3(long long u, word_type b)
+{
+ DWunion uu, w;
+ word_type bm;
+
+ if (b == 0)
+ return u;
+
+ uu.ll = u;
+ bm = 32 - b;
+
+ if (bm <= 0) {
+ w.s.low = 0;
+ w.s.high = (unsigned int) uu.s.low << -bm;
+ } else {
+ const unsigned int carries = (unsigned int) uu.s.low >> bm;
+
+ w.s.low = (unsigned int) uu.s.low << b;
+ w.s.high = ((unsigned int) uu.s.high << b) | carries;
+ }
+
+ return w.ll;
+}
+EXPORT_SYMBOL(__ashldi3);
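
For reference, a minimal user-space sketch (not part of this patch) of the same split-shift scheme, checked against the compiler's native 64-bit shift. It assumes a 32-bit int, a 64-bit long long and a little-endian host, so the local union mirrors the little-endian DWunion layout from libgcc.h:

#include <assert.h>

static long long shl64(long long u, int b)
{
	union { struct { int low, high; } s; long long ll; } uu, w;
	int bm = 32 - b;

	if (b == 0)
		return u;
	uu.ll = u;
	if (bm <= 0) {
		w.s.low = 0;
		w.s.high = (unsigned int) uu.s.low << -bm; /* low word moves up whole */
	} else {
		const unsigned int carries = (unsigned int) uu.s.low >> bm;

		w.s.low = (unsigned int) uu.s.low << b;
		w.s.high = ((unsigned int) uu.s.high << b) | carries;
	}
	return w.ll;
}

int main(void)
{
	int b;

	for (b = 0; b < 64; b++)
		assert(shl64(0x0123456789abcdefLL, b) ==
		       (long long)(0x0123456789abcdefULL << b));
	return 0;
}
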
diff --git a/arch/microblaze/lib/ashrdi3.c b/arch/microblaze/lib/ashrdi3.c
new file mode 100644
index 000000000..268098a96
--- /dev/null
+++ b/arch/microblaze/lib/ashrdi3.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+long long __ashrdi3(long long u, word_type b)
+{
+ DWunion uu, w;
+ word_type bm;
+
+ if (b == 0)
+ return u;
+
+ uu.ll = u;
+ bm = 32 - b;
+
+ if (bm <= 0) {
+ /* w.s.high = 1..1 or 0..0 */
+ w.s.high =
+ uu.s.high >> 31;
+ w.s.low = uu.s.high >> -bm;
+ } else {
+ const unsigned int carries = (unsigned int) uu.s.high << bm;
+
+ w.s.high = uu.s.high >> b;
+ w.s.low = ((unsigned int) uu.s.low >> b) | carries;
+ }
+
+ return w.ll;
+}
+EXPORT_SYMBOL(__ashrdi3);
diff --git a/arch/microblaze/lib/cmpdi2.c b/arch/microblaze/lib/cmpdi2.c
new file mode 100644
index 000000000..b3f896385
--- /dev/null
+++ b/arch/microblaze/lib/cmpdi2.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+word_type __cmpdi2(long long a, long long b)
+{
+ const DWunion au = {
+ .ll = a
+ };
+ const DWunion bu = {
+ .ll = b
+ };
+
+ if (au.s.high < bu.s.high)
+ return 0;
+ else if (au.s.high > bu.s.high)
+ return 2;
+
+ if ((unsigned int) au.s.low < (unsigned int) bu.s.low)
+ return 0;
+ else if ((unsigned int) au.s.low > (unsigned int) bu.s.low)
+ return 2;
+
+ return 1;
+}
+EXPORT_SYMBOL(__cmpdi2);
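
__cmpdi2() follows the libgcc convention of returning 0, 1 or 2 for "less than", "equal to" and "greater than"; the compiler emits calls to it for 64-bit comparisons it cannot open-code. A small illustration of the convention (assumes the declaration from libgcc.h is in scope):

static const char *describe(long long a, long long b)
{
	switch (__cmpdi2(a, b)) {
	case 0:
		return "a < b";
	case 1:
		return "a == b";
	default:
		return "a > b"; /* 2 */
	}
}
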
diff --git a/arch/microblaze/lib/divsi3.S b/arch/microblaze/lib/divsi3.S
new file mode 100644
index 000000000..919fb69f8
--- /dev/null
+++ b/arch/microblaze/lib/divsi3.S
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+
+/*
+* Divide operation for 32 bit integers.
+* Input : Dividend in Reg r5
+* Divisor in Reg r6
+* Output: Result in Reg r3
+*/
+ .text
+ .globl __divsi3
+ .type __divsi3, @function
+ .ent __divsi3
+__divsi3:
+ .frame r1, 0, r15
+
+ addik r1, r1, -16
+ swi r28, r1, 0
+ swi r29, r1, 4
+ swi r30, r1, 8
+ swi r31, r1, 12
+
+ beqi r6, div_by_zero /* div_by_zero - division error */
+ beqi r5, result_is_zero /* result is zero */
+ bgeid r5, r5_pos
+ xor r28, r5, r6 /* get the sign of the result */
+ rsubi r5, r5, 0 /* make r5 positive */
+r5_pos:
+ bgei r6, r6_pos
+ rsubi r6, r6, 0 /* make r6 positive */
+r6_pos:
+ addik r30, r0, 0 /* clear mod */
+ addik r3, r0, 0 /* clear div */
+ addik r29, r0, 32 /* initialize the loop count */
+
+ /* first part try to find the first '1' in the r5 */
+div0:
+ blti r5, div2 /* this traps r5 == 0x80000000 */
+div1:
+ add r5, r5, r5 /* left shift logical r5 */
+ bgtid r5, div1
+ addik r29, r29, -1
+div2:
+ /* left shift logical r5 get the '1' into the carry */
+ add r5, r5, r5
+ addc r30, r30, r30 /* move that bit into the mod register */
+ rsub r31, r6, r30 /* try to subtract (r30 - r6) */
+ blti r31, mod_too_small
+ /* move the r31 to mod since the result was positive */
+ or r30, r0, r31
+ addik r3, r3, 1
+mod_too_small:
+ addik r29, r29, -1
+ beqi r29, loop_end
+ add r3, r3, r3 /* shift in the '1' into div */
+ bri div2 /* div2 */
+loop_end:
+ bgei r28, return_here
+ brid return_here
+ rsubi r3, r3, 0 /* negate the result */
+div_by_zero:
+result_is_zero:
+ or r3, r0, r0 /* set result to 0 */
+return_here:
+/* restore values of csrs and that of r3 and the divisor and the dividend */
+ lwi r28, r1, 0
+ lwi r29, r1, 4
+ lwi r30, r1, 8
+ lwi r31, r1, 12
+ rtsd r15, 8
+ addik r1, r1, 16
+
+.size __divsi3, . - __divsi3
+.end __divsi3
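
The routine above is a 32-step restoring (shift-and-subtract) divider: the dividend is normalised until its leading '1' reaches the top, then each step shifts one dividend bit into the partial remainder and subtracts the divisor whenever it fits. A rough C model of the core loop (illustrative only; the sign handling, normalisation shortcut and zero checks done by the assembler are omitted):

static unsigned int udiv_model(unsigned int dividend, unsigned int divisor)
{
	unsigned int quotient = 0, remainder = 0;
	int i;

	for (i = 31; i >= 0; i--) {
		remainder = (remainder << 1) | ((dividend >> i) & 1);
		quotient <<= 1;
		if (remainder >= divisor) { /* the subtraction "fits" */
			remainder -= divisor;
			quotient |= 1;
		}
	}
	return quotient; /* remainder now holds dividend % divisor */
}
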
diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S
new file mode 100644
index 000000000..fdc48bb06
--- /dev/null
+++ b/arch/microblaze/lib/fastcopy.S
@@ -0,0 +1,666 @@
+/*
+ * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2008-2009 PetaLogix
+ * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file COPYING in the main directory of this
+ * archive for more details.
+ *
+ * Written by Jim Law <jlaw@irispower.com>
+ *
+ * intended to replace:
+ * memcpy in memcpy.c and
+ * memmove in memmove.c
+ * ... in arch/microblaze/lib
+ *
+ *
+ * assly_fastcopy.S
+ *
+ * Attempt at quicker memcpy and memmove for MicroBlaze
+ * Input : Operand1 in Reg r5 - destination address
+ * Operand2 in Reg r6 - source address
+ * Operand3 in Reg r7 - number of bytes to transfer
+ * Output: Result in Reg r3 - starting destination address
+ *
+ *
+ * Explanation:
+ * Perform (possibly unaligned) copy of a block of memory
+ * between mem locations with size of xfer spec'd in bytes
+ */
+
+#include <linux/linkage.h>
+ .text
+ .globl memcpy
+ .type memcpy, @function
+ .ent memcpy
+
+memcpy:
+fast_memcpy_ascending:
+ /* move d to return register as value of function */
+ addi r3, r5, 0
+
+ addi r4, r0, 4 /* n = 4 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
+
+ /* transfer first 0~3 bytes to get aligned dest address */
+ andi r4, r5, 3 /* n = d & 3 */
+ /* if zero, destination already aligned */
+ beqi r4, a_dalign_done
+ /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
+ rsubi r4, r4, 4
+ rsub r7, r4, r7 /* c = c - n adjust c */
+
+a_xfer_first_loop:
+ /* if no bytes left to transfer, transfer the bulk */
+ beqi r4, a_dalign_done
+ lbui r11, r6, 0 /* h = *s */
+ sbi r11, r5, 0 /* *d = h */
+ addi r6, r6, 1 /* s++ */
+ addi r5, r5, 1 /* d++ */
+ brid a_xfer_first_loop /* loop */
+ addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
+
+a_dalign_done:
+ addi r4, r0, 32 /* n = 32 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ /* if n < 0, less than one block to transfer */
+ blti r4, a_block_done
+
+a_block_xfer:
+ andi r4, r7, 0xffffffe0 /* n = c & ~31 */
+ rsub r7, r4, r7 /* c = c - n */
+
+ andi r9, r6, 3 /* t1 = s & 3 */
+ /* if temp != 0, unaligned transfers needed */
+ bnei r9, a_block_unaligned
+
+a_block_aligned:
+ lwi r9, r6, 0 /* t1 = *(s + 0) */
+ lwi r10, r6, 4 /* t2 = *(s + 4) */
+ lwi r11, r6, 8 /* t3 = *(s + 8) */
+ lwi r12, r6, 12 /* t4 = *(s + 12) */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ swi r10, r5, 4 /* *(d + 4) = t2 */
+ swi r11, r5, 8 /* *(d + 8) = t3 */
+ swi r12, r5, 12 /* *(d + 12) = t4 */
+ lwi r9, r6, 16 /* t1 = *(s + 16) */
+ lwi r10, r6, 20 /* t2 = *(s + 20) */
+ lwi r11, r6, 24 /* t3 = *(s + 24) */
+ lwi r12, r6, 28 /* t4 = *(s + 28) */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ swi r10, r5, 20 /* *(d + 20) = t2 */
+ swi r11, r5, 24 /* *(d + 24) = t3 */
+ swi r12, r5, 28 /* *(d + 28) = t4 */
+ addi r6, r6, 32 /* s = s + 32 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, a_block_aligned /* while (n) loop */
+ addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
+ bri a_block_done
+
+a_block_unaligned:
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
+ add r6, r6, r4 /* s = s + n */
+ lwi r11, r8, 0 /* h = *(as + 0) */
+
+ addi r9, r9, -1
+ beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
+ addi r9, r9, -1
+ beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
+
+a_block_u3:
+ bslli r11, r11, 24 /* h = h << 24 */
+a_bu3_loop:
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ lwi r12, r8, 32 /* v = *(as + 32) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ addi r8, r8, 32 /* as = as + 32 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, a_bu3_loop /* while (n) loop */
+ addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
+ bri a_block_done
+
+a_block_u1:
+ bslli r11, r11, 8 /* h = h << 8 */
+a_bu1_loop:
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ lwi r12, r8, 32 /* v = *(as + 32) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ addi r8, r8, 32 /* as = as + 32 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, a_bu1_loop /* while (n) loop */
+ addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
+ bri a_block_done
+
+a_block_u2:
+ bslli r11, r11, 16 /* h = h << 16 */
+a_bu2_loop:
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ lwi r12, r8, 32 /* v = *(as + 32) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ addi r8, r8, 32 /* as = as + 32 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, a_bu2_loop /* while (n) loop */
+ addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
+
+a_block_done:
+ addi r4, r0, 4 /* n = 4 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
+
+a_word_xfer:
+ andi r4, r7, 0xfffffffc /* n = c & ~3 */
+ addi r10, r0, 0 /* offset = 0 */
+
+ andi r9, r6, 3 /* t1 = s & 3 */
+ /* if temp != 0, unaligned transfers needed */
+ bnei r9, a_word_unaligned
+
+a_word_aligned:
+ lw r9, r6, r10 /* t1 = *(s+offset) */
+ sw r9, r5, r10 /* *(d+offset) = t1 */
+ addi r4, r4,-4 /* n-- */
+ bneid r4, a_word_aligned /* loop */
+ addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
+
+ bri a_word_done
+
+a_word_unaligned:
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
+ lwi r11, r8, 0 /* h = *(as + 0) */
+ addi r8, r8, 4 /* as = as + 4 */
+
+ addi r9, r9, -1
+ beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
+ addi r9, r9, -1
+ beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
+
+a_word_u3:
+ bslli r11, r11, 24 /* h = h << 24 */
+a_wu3_loop:
+ lw r12, r8, r10 /* v = *(as + offset) */
+ bsrli r9, r12, 8 /* t1 = v >> 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r10 /* *(d + offset) = t1 */
+ bslli r11, r12, 24 /* h = v << 24 */
+ addi r4, r4,-4 /* n = n - 4 */
+ bneid r4, a_wu3_loop /* while (n) loop */
+ addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */
+
+ bri a_word_done
+
+a_word_u1:
+ bslli r11, r11, 8 /* h = h << 8 */
+a_wu1_loop:
+ lw r12, r8, r10 /* v = *(as + offset) */
+ bsrli r9, r12, 24 /* t1 = v >> 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r10 /* *(d + offset) = t1 */
+ bslli r11, r12, 8 /* h = v << 8 */
+ addi r4, r4,-4 /* n = n - 4 */
+ bneid r4, a_wu1_loop /* while (n) loop */
+ addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */
+
+ bri a_word_done
+
+a_word_u2:
+ bslli r11, r11, 16 /* h = h << 16 */
+a_wu2_loop:
+ lw r12, r8, r10 /* v = *(as + offset) */
+ bsrli r9, r12, 16 /* t1 = v >> 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r10 /* *(d + offset) = t1 */
+ bslli r11, r12, 16 /* h = v << 16 */
+ addi r4, r4,-4 /* n = n - 4 */
+ bneid r4, a_wu2_loop /* while (n) loop */
+ addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */
+
+a_word_done:
+ add r5, r5, r10 /* d = d + offset */
+ add r6, r6, r10 /* s = s + offset */
+ rsub r7, r10, r7 /* c = c - offset */
+
+a_xfer_end:
+a_xfer_end_loop:
+ beqi r7, a_done /* while (c) */
+ lbui r9, r6, 0 /* t1 = *s */
+ addi r6, r6, 1 /* s++ */
+ sbi r9, r5, 0 /* *d = t1 */
+ addi r7, r7, -1 /* c-- */
+ brid a_xfer_end_loop /* loop */
+ addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
+
+a_done:
+ rtsd r15, 8
+ nop
+
+.size memcpy, . - memcpy
+.end memcpy
+/*----------------------------------------------------------------------------*/
+ .globl memmove
+ .type memmove, @function
+ .ent memmove
+
+memmove:
+ cmpu r4, r5, r6 /* n = s - d */
+ bgei r4,fast_memcpy_ascending
+
+fast_memcpy_descending:
+ /* move d to return register as value of function */
+ addi r3, r5, 0
+
+ add r5, r5, r7 /* d = d + c */
+ add r6, r6, r7 /* s = s + c */
+
+ addi r4, r0, 4 /* n = 4 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
+
+ /* transfer first 0~3 bytes to get aligned dest address */
+ andi r4, r5, 3 /* n = d & 3 */
+ /* if zero, destination already aligned */
+ beqi r4,d_dalign_done
+ rsub r7, r4, r7 /* c = c - n adjust c */
+
+d_xfer_first_loop:
+ /* if no bytes left to transfer, transfer the bulk */
+ beqi r4,d_dalign_done
+ addi r6, r6, -1 /* s-- */
+ addi r5, r5, -1 /* d-- */
+ lbui r11, r6, 0 /* h = *s */
+ sbi r11, r5, 0 /* *d = h */
+ brid d_xfer_first_loop /* loop */
+ addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
+
+d_dalign_done:
+ addi r4, r0, 32 /* n = 32 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ /* if n < 0, less than one block to transfer */
+ blti r4, d_block_done
+
+d_block_xfer:
+ andi r4, r7, 0xffffffe0 /* n = c & ~31 */
+ rsub r7, r4, r7 /* c = c - n */
+
+ andi r9, r6, 3 /* t1 = s & 3 */
+ /* if temp != 0, unaligned transfers needed */
+ bnei r9, d_block_unaligned
+
+d_block_aligned:
+ addi r6, r6, -32 /* s = s - 32 */
+ addi r5, r5, -32 /* d = d - 32 */
+ lwi r9, r6, 28 /* t1 = *(s + 28) */
+ lwi r10, r6, 24 /* t2 = *(s + 24) */
+ lwi r11, r6, 20 /* t3 = *(s + 20) */
+ lwi r12, r6, 16 /* t4 = *(s + 16) */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ swi r10, r5, 24 /* *(d + 24) = t2 */
+ swi r11, r5, 20 /* *(d + 20) = t3 */
+ swi r12, r5, 16 /* *(d + 16) = t4 */
+ lwi r9, r6, 12 /* t1 = *(s + 12) */
+ lwi r10, r6, 8 /* t2 = *(s + 8) */
+ lwi r11, r6, 4 /* t3 = *(s + 4) */
+ lwi r12, r6, 0 /* t4 = *(s + 0) */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ swi r10, r5, 8 /* *(d + 8) = t2 */
+ swi r11, r5, 4 /* *(d + 4) = t3 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, d_block_aligned /* while (n) loop */
+ swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
+ bri d_block_done
+
+d_block_unaligned:
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
+ rsub r6, r4, r6 /* s = s - n */
+ lwi r11, r8, 0 /* h = *(as + 0) */
+
+ addi r9, r9, -1
+ beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
+ addi r9, r9, -1
+ beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
+
+d_block_u3:
+ bsrli r11, r11, 8 /* h = h >> 8 */
+d_bu3_loop:
+ addi r8, r8, -32 /* as = as - 32 */
+ addi r5, r5, -32 /* d = d - 32 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bsrli r11, r12, 8 /* h = v >> 8 */
+ lwi r12, r8, 0 /* v = *(as + 0) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, d_bu3_loop /* while (n) loop */
+ bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
+ bri d_block_done
+
+d_block_u1:
+ bsrli r11, r11, 24 /* h = h >> 24 */
+d_bu1_loop:
+ addi r8, r8, -32 /* as = as - 32 */
+ addi r5, r5, -32 /* d = d - 32 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bsrli r11, r12, 24 /* h = v >> 24 */
+ lwi r12, r8, 0 /* v = *(as + 0) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, d_bu1_loop /* while (n) loop */
+ bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
+ bri d_block_done
+
+d_block_u2:
+ bsrli r11, r11, 16 /* h = h >> 16 */
+d_bu2_loop:
+ addi r8, r8, -32 /* as = as - 32 */
+ addi r5, r5, -32 /* d = d - 32 */
+ lwi r12, r8, 28 /* v = *(as + 28) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 28 /* *(d + 28) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 24 /* v = *(as + 24) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 24 /* *(d + 24) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 20 /* v = *(as + 20) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 20 /* *(d + 20) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 16 /* v = *(as + 16) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 16 /* *(d + 16) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 12 /* v = *(as + 12) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 12 /* *(d + 12) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 8 /* v = *(as + 8) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 8 /* *(d + 8) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 4 /* v = *(as + 4) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 4 /* *(d + 4) = t1 */
+ bsrli r11, r12, 16 /* h = v >> 16 */
+ lwi r12, r8, 0 /* v = *(as + 0) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ swi r9, r5, 0 /* *(d + 0) = t1 */
+ addi r4, r4, -32 /* n = n - 32 */
+ bneid r4, d_bu2_loop /* while (n) loop */
+ bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
+
+d_block_done:
+ addi r4, r0, 4 /* n = 4 */
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
+ blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
+
+d_word_xfer:
+ andi r4, r7, 0xfffffffc /* n = c & ~3 */
+ rsub r5, r4, r5 /* d = d - n */
+ rsub r6, r4, r6 /* s = s - n */
+ rsub r7, r4, r7 /* c = c - n */
+
+ andi r9, r6, 3 /* t1 = s & 3 */
+ /* if temp != 0, unaligned transfers needed */
+ bnei r9, d_word_unaligned
+
+d_word_aligned:
+ addi r4, r4,-4 /* n-- */
+ lw r9, r6, r4 /* t1 = *(s+n) */
+ bneid r4, d_word_aligned /* loop */
+ sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
+
+ bri d_word_done
+
+d_word_unaligned:
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
+ lw r11, r8, r4 /* h = *(as + n) */
+
+ addi r9, r9, -1
+ beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
+ addi r9, r9, -1
+ beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
+
+d_word_u3:
+ bsrli r11, r11, 8 /* h = h >> 8 */
+d_wu3_loop:
+ addi r4, r4,-4 /* n = n - 4 */
+ lw r12, r8, r4 /* v = *(as + n) */
+ bslli r9, r12, 24 /* t1 = v << 24 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r4 /* *(d + n) = t1 */
+ bneid r4, d_wu3_loop /* while (n) loop */
+ bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
+
+ bri d_word_done
+
+d_word_u1:
+ bsrli r11, r11, 24 /* h = h >> 24 */
+d_wu1_loop:
+ addi r4, r4,-4 /* n = n - 4 */
+ lw r12, r8, r4 /* v = *(as + n) */
+ bslli r9, r12, 8 /* t1 = v << 8 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r4 /* *(d + n) = t1 */
+ bneid r4, d_wu1_loop /* while (n) loop */
+ bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
+
+ bri d_word_done
+
+d_word_u2:
+ bsrli r11, r11, 16 /* h = h >> 16 */
+d_wu2_loop:
+ addi r4, r4,-4 /* n = n - 4 */
+ lw r12, r8, r4 /* v = *(as + n) */
+ bslli r9, r12, 16 /* t1 = v << 16 */
+ or r9, r11, r9 /* t1 = h | t1 */
+ sw r9, r5, r4 /* *(d + n) = t1 */
+ bneid r4, d_wu2_loop /* while (n) loop */
+ bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
+
+d_word_done:
+
+d_xfer_end:
+d_xfer_end_loop:
+ beqi r7, a_done /* while (c) */
+ addi r6, r6, -1 /* s-- */
+ lbui r9, r6, 0 /* t1 = *s */
+ addi r5, r5, -1 /* d-- */
+ sbi r9, r5, 0 /* *d = t1 */
+ brid d_xfer_end_loop /* loop */
+ addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
+
+d_done:
+ rtsd r15, 8
+ nop
+
+.size memmove, . - memmove
+.end memmove
diff --git a/arch/microblaze/lib/libgcc.h b/arch/microblaze/lib/libgcc.h
new file mode 100644
index 000000000..a909366c2
--- /dev/null
+++ b/arch/microblaze/lib/libgcc.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_LIBGCC_H
+#define __ASM_LIBGCC_H
+
+#include <asm/byteorder.h>
+
+typedef int word_type __attribute__ ((mode (__word__)));
+
+#ifdef __BIG_ENDIAN
+struct DWstruct {
+ int high, low;
+};
+#elif defined(__LITTLE_ENDIAN)
+struct DWstruct {
+ int low, high;
+};
+#else
+#error I feel sick.
+#endif
+
+typedef union {
+ struct DWstruct s;
+ long long ll;
+} DWunion;
+
+extern long long __ashldi3(long long u, word_type b);
+extern long long __ashrdi3(long long u, word_type b);
+extern word_type __cmpdi2(long long a, long long b);
+extern long long __lshrdi3(long long u, word_type b);
+extern long long __muldi3(long long u, long long v);
+extern word_type __ucmpdi2(unsigned long long a, unsigned long long b);
+
+#endif /* __ASM_LIBGCC_H */
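
DWunion lets the helpers address the two 32-bit halves of a 64-bit value by name; the #ifdef only changes the in-memory member order so that s.high always overlays the most significant word. A tiny illustration (not part of the patch):

static int dwunion_high(long long v)
{
	const DWunion u = { .ll = v };

	/* For v == 0x0011223344556677LL this returns 0x00112233 on either
	 * endianness; only the storage order of the two ints differs. */
	return u.s.high;
}
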
diff --git a/arch/microblaze/lib/lshrdi3.c b/arch/microblaze/lib/lshrdi3.c
new file mode 100644
index 000000000..00eb82682
--- /dev/null
+++ b/arch/microblaze/lib/lshrdi3.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+long long __lshrdi3(long long u, word_type b)
+{
+ DWunion uu, w;
+ word_type bm;
+
+ if (b == 0)
+ return u;
+
+ uu.ll = u;
+ bm = 32 - b;
+
+ if (bm <= 0) {
+ w.s.high = 0;
+ w.s.low = (unsigned int) uu.s.high >> -bm;
+ } else {
+ const unsigned int carries = (unsigned int) uu.s.high << bm;
+
+ w.s.high = (unsigned int) uu.s.high >> b;
+ w.s.low = ((unsigned int) uu.s.low >> b) | carries;
+ }
+
+ return w.ll;
+}
+EXPORT_SYMBOL(__lshrdi3);
diff --git a/arch/microblaze/lib/memcpy.c b/arch/microblaze/lib/memcpy.c
new file mode 100644
index 000000000..9966dce55
--- /dev/null
+++ b/arch/microblaze/lib/memcpy.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2008-2009 PetaLogix
+ * Copyright (C) 2007 John Williams
+ *
+ * Reasonably optimised generic C-code for memcpy on Microblaze
+ * This is generic C code to do efficient, alignment-aware memcpy.
+ *
+ * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
+ * http://www.embedded.com/showArticle.jhtml?articleID=19205567
+ *
+ * Attempts were made, unsuccessfully, to contact the original
+ * author of this code (Michael Morrow, Intel). Below is the original
+ * copyright notice.
+ *
+ * This software has been developed by Intel Corporation.
+ * Intel specifically disclaims all warranties, express or
+ * implied, and all liability, including consequential and
+ * other indirect damages, for the use of this program, including
+ * liability for infringement of any proprietary rights,
+ * and including the warranties of merchantability and fitness
+ * for a particular purpose. Intel does not assume any
+ * responsibility for and errors which may appear in this program
+ * not any responsibility to update it.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/compiler.h>
+
+#include <linux/string.h>
+
+#ifdef CONFIG_OPT_LIB_FUNCTION
+void *memcpy(void *v_dst, const void *v_src, __kernel_size_t c)
+{
+ const char *src = v_src;
+ char *dst = v_dst;
+
+ /* The following code tries to optimize the copy by using unsigned
+ * alignment. This will work fine if both source and destination are
+ * aligned on the same boundary. However, if they are aligned on
+ * different boundaries shifts will be necessary. This might result in
+ * bad performance on MicroBlaze systems without a barrel shifter.
+ */
+ const uint32_t *i_src;
+ uint32_t *i_dst;
+
+ if (likely(c >= 4)) {
+ unsigned value, buf_hold;
+
+ /* Align the destination to a word boundary. */
+ /* This is done in an endian independent manner. */
+ switch ((unsigned long)dst & 3) {
+ case 1:
+ *dst++ = *src++;
+ --c;
+ fallthrough;
+ case 2:
+ *dst++ = *src++;
+ --c;
+ fallthrough;
+ case 3:
+ *dst++ = *src++;
+ --c;
+ }
+
+ i_dst = (void *)dst;
+
+ /* Choose a copy scheme based on the source */
+ /* alignment relative to destination. */
+ switch ((unsigned long)src & 3) {
+ case 0x0: /* Both byte offsets are aligned */
+ i_src = (const void *)src;
+
+ for (; c >= 4; c -= 4)
+ *i_dst++ = *i_src++;
+
+ src = (const void *)i_src;
+ break;
+ case 0x1: /* Unaligned - Off by 1 */
+ /* Word align the source */
+ i_src = (const void *) ((unsigned)src & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *i_src++ << 8;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | value >> 24;
+ buf_hold = value << 8;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*i_src++ & 0xFFFFFF00) >> 8;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | ((value & 0xFF) << 24);
+ buf_hold = (value & 0xFFFFFF00) >> 8;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src -= 3;
+ break;
+ case 0x2: /* Unaligned - Off by 2 */
+ /* Word align the source */
+ i_src = (const void *) ((unsigned)src & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *i_src++ << 16;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | value >> 16;
+ buf_hold = value << 16;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*i_src++ & 0xFFFF0000) >> 16;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | ((value & 0xFFFF) << 16);
+ buf_hold = (value & 0xFFFF0000) >> 16;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src -= 2;
+ break;
+ case 0x3: /* Unaligned - Off by 3 */
+ /* Word align the source */
+ i_src = (const void *) ((unsigned)src & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *i_src++ << 24;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | value >> 8;
+ buf_hold = value << 24;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*i_src++ & 0xFF000000) >> 24;
+
+ for (; c >= 4; c -= 4) {
+ value = *i_src++;
+ *i_dst++ = buf_hold | ((value & 0xFFFFFF) << 8);
+ buf_hold = (value & 0xFF000000) >> 24;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src -= 1;
+ break;
+ }
+ dst = (void *)i_dst;
+ }
+
+ /* Finish off any remaining bytes */
+ /* simple fast copy, ... unless a cache boundary is crossed */
+ switch (c) {
+ case 3:
+ *dst++ = *src++;
+ fallthrough;
+ case 2:
+ *dst++ = *src++;
+ fallthrough;
+ case 1:
+ *dst++ = *src++;
+ }
+
+ return v_dst;
+}
+EXPORT_SYMBOL(memcpy);
+#endif /* CONFIG_OPT_LIB_FUNCTION */
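
A worked example of the holding-buffer scheme in the big-endian "off by 1" case may help: each output word combines the three low bytes carried over from the previous source word with the top byte of the next one (byte values below are made up):

/* aligned source words: w0 = 0xB0B1B2B3, w1 = 0xB4B5B6B7, copy starts at B1
 *
 *	buf_hold = w0 << 8             = 0xB1B2B300
 *	out0     = buf_hold | w1 >> 24 = 0xB1B2B3B4   <- bytes B1..B4
 *	buf_hold = w1 << 8             = 0xB5B6B700   <- carried to next word
 */
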
diff --git a/arch/microblaze/lib/memmove.c b/arch/microblaze/lib/memmove.c
new file mode 100644
index 000000000..c1f08c484
--- /dev/null
+++ b/arch/microblaze/lib/memmove.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2008-2009 PetaLogix
+ * Copyright (C) 2007 John Williams
+ *
+ * Reasonably optimised generic C-code for memmove on Microblaze
+ * This is generic C code to do efficient, alignment-aware memmove.
+ *
+ * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
+ * http://www.embedded.com/showArticle.jhtml?articleID=19205567
+ *
+ * Attempts were made, unsuccessfully, to contact the original
+ * author of this code (Michael Morrow, Intel). Below is the original
+ * copyright notice.
+ *
+ * This software has been developed by Intel Corporation.
+ * Intel specifically disclaims all warranties, express or
+ * implied, and all liability, including consequential and
+ * other indirect damages, for the use of this program, including
+ * liability for infringement of any proprietary rights,
+ * and including the warranties of merchantability and fitness
+ * for a particular purpose. Intel does not assume any
+ * responsibility for and errors which may appear in this program
+ * not any responsibility to update it.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+
+#ifdef CONFIG_OPT_LIB_FUNCTION
+void *memmove(void *v_dst, const void *v_src, __kernel_size_t c)
+{
+ const char *src = v_src;
+ char *dst = v_dst;
+ const uint32_t *i_src;
+ uint32_t *i_dst;
+
+ if (!c)
+ return v_dst;
+
+ /* Use memcpy when source is higher than dest */
+ if (v_dst <= v_src)
+ return memcpy(v_dst, v_src, c);
+
+ /* The following code tries to optimize the copy by using unsigned
+ * alignment. This will work fine if both source and destination are
+ * aligned on the same boundary. However, if they are aligned on
+ * different boundaries shifts will be necessary. This might result in
+ * bad performance on MicroBlaze systems without a barrel shifter.
+ */
+ /* FIXME this part needs more test */
+ /* Do a descending copy - this is a bit trickier! */
+ dst += c;
+ src += c;
+
+ if (c >= 4) {
+ unsigned value, buf_hold;
+
+ /* Align the destination to a word boundary. */
+ /* This is done in an endian independent manner. */
+
+ switch ((unsigned long)dst & 3) {
+ case 3:
+ *--dst = *--src;
+ --c;
+ fallthrough;
+ case 2:
+ *--dst = *--src;
+ --c;
+ fallthrough;
+ case 1:
+ *--dst = *--src;
+ --c;
+ }
+
+ i_dst = (void *)dst;
+ /* Choose a copy scheme based on the source */
+ /* alignment relative to destination. */
+ switch ((unsigned long)src & 3) {
+ case 0x0: /* Both byte offsets are aligned */
+
+ i_src = (const void *)src;
+
+ for (; c >= 4; c -= 4)
+ *--i_dst = *--i_src;
+
+ src = (const void *)i_src;
+ break;
+ case 0x1: /* Unaligned - Off by 1 */
+ /* Word align the source */
+ i_src = (const void *) (((unsigned)src + 4) & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *--i_src >> 24;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = value << 8 | buf_hold;
+ buf_hold = value >> 24;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*--i_src & 0xFF) << 24;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = buf_hold |
+ ((value & 0xFFFFFF00) >> 8);
+ buf_hold = (value & 0xFF) << 24;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src += 1;
+ break;
+ case 0x2: /* Unaligned - Off by 2 */
+ /* Word align the source */
+ i_src = (const void *) (((unsigned)src + 4) & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *--i_src >> 16;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = value << 16 | buf_hold;
+ buf_hold = value >> 16;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*--i_src & 0xFFFF) << 16;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = buf_hold |
+ ((value & 0xFFFF0000) >> 16);
+ buf_hold = (value & 0xFFFF) << 16;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src += 2;
+ break;
+ case 0x3: /* Unaligned - Off by 3 */
+ /* Word align the source */
+ i_src = (const void *) (((unsigned)src + 4) & ~3);
+#ifndef __MICROBLAZEEL__
+ /* Load the holding buffer */
+ buf_hold = *--i_src >> 8;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = value << 24 | buf_hold;
+ buf_hold = value >> 8;
+ }
+#else
+ /* Load the holding buffer */
+ buf_hold = (*--i_src & 0xFFFFFF) << 8;
+
+ for (; c >= 4; c -= 4) {
+ value = *--i_src;
+ *--i_dst = buf_hold |
+ ((value & 0xFF000000) >> 24);
+ buf_hold = (value & 0xFFFFFF) << 8;
+ }
+#endif
+ /* Realign the source */
+ src = (const void *)i_src;
+ src += 3;
+ break;
+ }
+ dst = (void *)i_dst;
+ }
+
+ /* simple fast copy, ... unless a cache boundary is crossed */
+ /* Finish off any remaining bytes */
+ switch (c) {
+ case 4:
+ *--dst = *--src;
+ fallthrough;
+ case 3:
+ *--dst = *--src;
+ fallthrough;
+ case 2:
+ *--dst = *--src;
+ fallthrough;
+ case 1:
+ *--dst = *--src;
+ }
+ return v_dst;
+}
+EXPORT_SYMBOL(memmove);
+#endif /* CONFIG_OPT_LIB_FUNCTION */
diff --git a/arch/microblaze/lib/memset.c b/arch/microblaze/lib/memset.c
new file mode 100644
index 000000000..7c2352d56
--- /dev/null
+++ b/arch/microblaze/lib/memset.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2008-2009 PetaLogix
+ * Copyright (C) 2007 John Williams
+ *
+ * Reasonably optimised generic C-code for memset on Microblaze
+ * This is generic C code to do efficient, alignment-aware memset.
+ *
+ * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
+ * http://www.embedded.com/showArticle.jhtml?articleID=19205567
+ *
+ * Attempts were made, unsuccessfully, to contact the original
+ * author of this code (Michael Morrow, Intel). Below is the original
+ * copyright notice.
+ *
+ * This software has been developed by Intel Corporation.
+ * Intel specifically disclaims all warranties, express or
+ * implied, and all liability, including consequential and
+ * other indirect damages, for the use of this program, including
+ * liability for infringement of any proprietary rights,
+ * and including the warranties of merchantability and fitness
+ * for a particular purpose. Intel does not assume any
+ * responsibility for and errors which may appear in this program
+ * not any responsibility to update it.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+
+#ifdef CONFIG_OPT_LIB_FUNCTION
+void *memset(void *v_src, int c, __kernel_size_t n)
+{
+ char *src = v_src;
+ uint32_t *i_src;
+ uint32_t w32 = 0;
+
+ /* Truncate c to 8 bits */
+ c = (c & 0xFF);
+
+ if (unlikely(c)) {
+ /* Make a repeating word out of it */
+ w32 = c;
+ w32 |= w32 << 8;
+ w32 |= w32 << 16;
+ }
+
+ if (likely(n >= 4)) {
+ /* Align the destination to a word boundary */
+ /* This is done in an endian independent manner */
+ switch ((unsigned) src & 3) {
+ case 1:
+ *src++ = c;
+ --n;
+ fallthrough;
+ case 2:
+ *src++ = c;
+ --n;
+ fallthrough;
+ case 3:
+ *src++ = c;
+ --n;
+ }
+
+ i_src = (void *)src;
+
+ /* Do as many full-word copies as we can */
+ for (; n >= 4; n -= 4)
+ *i_src++ = w32;
+
+ src = (void *)i_src;
+ }
+
+ /* Simple, byte oriented memset or the rest of count. */
+ switch (n) {
+ case 3:
+ *src++ = c;
+ fallthrough;
+ case 2:
+ *src++ = c;
+ fallthrough;
+ case 1:
+ *src++ = c;
+ break;
+ default:
+ break;
+ }
+
+ return v_src;
+}
+EXPORT_SYMBOL(memset);
+#endif /* CONFIG_OPT_LIB_FUNCTION */
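
Building the fill word by repeated shift-or is equivalent to multiplying the byte by 0x01010101; a short sketch of the equivalence (illustrative only):

#include <stdint.h>

static uint32_t fill_word(uint8_t c)
{
	uint32_t w32 = c;

	w32 |= w32 << 8;	/* 0x000000cc -> 0x0000cccc */
	w32 |= w32 << 16;	/* 0x0000cccc -> 0xcccccccc */
	return w32;		/* same result as (uint32_t)c * 0x01010101u */
}
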
diff --git a/arch/microblaze/lib/modsi3.S b/arch/microblaze/lib/modsi3.S
new file mode 100644
index 000000000..64154865c
--- /dev/null
+++ b/arch/microblaze/lib/modsi3.S
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+
+/*
+* modulo operation for 32 bit integers.
+* Input : op1 in Reg r5
+* op2 in Reg r6
+* Output: op1 mod op2 in Reg r3
+*/
+
+ .text
+ .globl __modsi3
+ .type __modsi3, @function
+ .ent __modsi3
+
+__modsi3:
+ .frame r1, 0, r15
+
+ addik r1, r1, -16
+ swi r28, r1, 0
+ swi r29, r1, 4
+ swi r30, r1, 8
+ swi r31, r1, 12
+
+ beqi r6, div_by_zero /* div_by_zero division error */
+ beqi r5, result_is_zero /* result is zero */
+ bgeid r5, r5_pos
+ /* get the sign of the result [ depends only on the first arg] */
+ add r28, r5, r0
+ rsubi r5, r5, 0 /* make r5 positive */
+r5_pos:
+ bgei r6, r6_pos
+ rsubi r6, r6, 0 /* make r6 positive */
+r6_pos:
+ addik r3, r0, 0 /* clear mod */
+ addik r30, r0, 0 /* clear div */
+ addik r29, r0, 32 /* initialize the loop count */
+/* first part try to find the first '1' in the r5 */
+div1:
+ add r5, r5, r5 /* left shift logical r5 */
+ bgeid r5, div1
+ addik r29, r29, -1
+div2:
+ /* left shift logical r5 get the '1' into the carry */
+ add r5, r5, r5
+ addc r3, r3, r3 /* move that bit into the mod register */
+ rsub r31, r6, r3 /* try to subtract (r3 - r6) */
+ blti r31, mod_too_small
+ /* move the r31 to mod since the result was positive */
+ or r3, r0, r31
+ addik r30, r30, 1
+mod_too_small:
+ addik r29, r29, -1
+ beqi r29, loop_end
+ add r30, r30, r30 /* shift in the '1' into div */
+ bri div2 /* div2 */
+loop_end:
+ bgei r28, return_here
+ brid return_here
+ rsubi r3, r3, 0 /* negate the result */
+div_by_zero:
+result_is_zero:
+ or r3, r0, r0 /* set result to 0 [both mod as well as div are 0] */
+return_here:
+/* restore values of csrs and that of r3 and the divisor and the dividend */
+ lwi r28, r1, 0
+ lwi r29, r1, 4
+ lwi r30, r1, 8
+ lwi r31, r1, 12
+ rtsd r15, 8
+ addik r1, r1, 16
+
+.size __modsi3, . - __modsi3
+.end __modsi3
diff --git a/arch/microblaze/lib/muldi3.c b/arch/microblaze/lib/muldi3.c
new file mode 100644
index 000000000..8e6bc17e2
--- /dev/null
+++ b/arch/microblaze/lib/muldi3.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+#define W_TYPE_SIZE 32
+
+#define __ll_B ((unsigned long) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((unsigned long) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((unsigned long) (t) >> (W_TYPE_SIZE / 2))
+
+/* If we still don't have umul_ppmm, define it using plain C. */
+#if !defined(umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ unsigned long __x0, __x1, __x2, __x3; \
+ unsigned short __ul, __vl, __uh, __vh; \
+ \
+ __ul = __ll_lowpart(u); \
+ __uh = __ll_highpart(u); \
+ __vl = __ll_lowpart(v); \
+ __vh = __ll_highpart(v); \
+ \
+ __x0 = (unsigned long) __ul * __vl; \
+ __x1 = (unsigned long) __ul * __vh; \
+ __x2 = (unsigned long) __uh * __vl; \
+ __x3 = (unsigned long) __uh * __vh; \
+ \
+ __x1 += __ll_highpart(__x0); /* this can't give carry */\
+ __x1 += __x2; /* but this indeed can */ \
+ if (__x1 < __x2) /* did we get it? */ \
+ __x3 += __ll_B; /* yes, add it in the proper pos */ \
+ \
+ (w1) = __x3 + __ll_highpart(__x1); \
+ (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0);\
+ } while (0)
+#endif
+
+#if !defined(__umulsidi3)
+#define __umulsidi3(u, v) ({ \
+ DWunion __w; \
+ umul_ppmm(__w.s.high, __w.s.low, u, v); \
+ __w.ll; \
+ })
+#endif
+
+long long __muldi3(long long u, long long v)
+{
+ const DWunion uu = {.ll = u};
+ const DWunion vv = {.ll = v};
+ DWunion w = {.ll = __umulsidi3(uu.s.low, vv.s.low)};
+
+ w.s.high += ((unsigned long) uu.s.low * (unsigned long) vv.s.high
+ + (unsigned long) uu.s.high * (unsigned long) vv.s.low);
+
+ return w.ll;
+}
+EXPORT_SYMBOL(__muldi3);
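
umul_ppmm() splits each 32-bit operand into 16-bit halves, forms the four partial products and folds the one possible carry into the high word. A hypothetical user-space check of the identity u * v == ((u64)w1 << 32) | w0 (it assumes the umul_ppmm/__ll_* macro definitions above are copied into the test file):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t u = 0xdeadbeef, v = 0x12345678;
	unsigned long w1, w0;

	umul_ppmm(w1, w0, u, v);
	assert((((uint64_t)w1 << 32) | (uint32_t)w0) == (uint64_t)u * v);
	return 0;
}
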
diff --git a/arch/microblaze/lib/mulsi3.S b/arch/microblaze/lib/mulsi3.S
new file mode 100644
index 000000000..d6703a4d9
--- /dev/null
+++ b/arch/microblaze/lib/mulsi3.S
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+
+/*
+ * Multiply operation for 32 bit integers.
+ * Input : Operand1 in Reg r5
+ * Operand2 in Reg r6
+ * Output: Result [op1 * op2] in Reg r3
+ */
+ .text
+ .globl __mulsi3
+ .type __mulsi3, @function
+ .ent __mulsi3
+
+__mulsi3:
+ .frame r1, 0, r15
+ add r3, r0, r0
+ beqi r5, result_is_zero /* multiply by zero */
+ beqi r6, result_is_zero /* multiply by zero */
+ bgeid r5, r5_pos
+ xor r4, r5, r6 /* get the sign of the result */
+ rsubi r5, r5, 0 /* make r5 positive */
+r5_pos:
+ bgei r6, r6_pos
+ rsubi r6, r6, 0 /* make r6 positive */
+r6_pos:
+ bri l1
+l2:
+ add r5, r5, r5
+l1:
+ srl r6, r6
+ addc r7, r0, r0
+ beqi r7, l2
+ bneid r6, l2
+ add r3, r3, r5
+ blti r4, negateresult
+ rtsd r15, 8
+ nop
+negateresult:
+ rtsd r15, 8
+ rsub r3, r3, r0
+result_is_zero:
+ rtsd r15, 8
+ addi r3, r0, 0
+
+.size __mulsi3, . - __mulsi3
+.end __mulsi3
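
__mulsi3 is a shift-and-add multiplier: it scans the (made-positive) multiplier one bit at a time, adds the progressively doubled multiplicand whenever that bit is set, and fixes up the sign at the end. A rough C model of the loop (sign handling omitted):

static unsigned int umul_model(unsigned int a, unsigned int b)
{
	unsigned int result = 0;

	while (b) {
		if (b & 1)	/* current multiplier bit set? */
			result += a;
		a <<= 1;	/* double the multiplicand */
		b >>= 1;	/* move on to the next bit */
	}
	return result;
}
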
diff --git a/arch/microblaze/lib/uaccess_old.S b/arch/microblaze/lib/uaccess_old.S
new file mode 100644
index 000000000..dd5f3bfbc
--- /dev/null
+++ b/arch/microblaze/lib/uaccess_old.S
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2009 PetaLogix
+ * Copyright (C) 2007 LynuxWorks, Inc.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/errno.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/* Loop unrolling for __copy_tofrom_user */
+#define COPY(offset) \
+1: lwi r4 , r6, 0x0000 + offset; \
+2: lwi r19, r6, 0x0004 + offset; \
+3: lwi r20, r6, 0x0008 + offset; \
+4: lwi r21, r6, 0x000C + offset; \
+5: lwi r22, r6, 0x0010 + offset; \
+6: lwi r23, r6, 0x0014 + offset; \
+7: lwi r24, r6, 0x0018 + offset; \
+8: lwi r25, r6, 0x001C + offset; \
+9: swi r4 , r5, 0x0000 + offset; \
+10: swi r19, r5, 0x0004 + offset; \
+11: swi r20, r5, 0x0008 + offset; \
+12: swi r21, r5, 0x000C + offset; \
+13: swi r22, r5, 0x0010 + offset; \
+14: swi r23, r5, 0x0014 + offset; \
+15: swi r24, r5, 0x0018 + offset; \
+16: swi r25, r5, 0x001C + offset; \
+ .section __ex_table,"a"; \
+ .word 1b, 33f; \
+ .word 2b, 33f; \
+ .word 3b, 33f; \
+ .word 4b, 33f; \
+ .word 5b, 33f; \
+ .word 6b, 33f; \
+ .word 7b, 33f; \
+ .word 8b, 33f; \
+ .word 9b, 33f; \
+ .word 10b, 33f; \
+ .word 11b, 33f; \
+ .word 12b, 33f; \
+ .word 13b, 33f; \
+ .word 14b, 33f; \
+ .word 15b, 33f; \
+ .word 16b, 33f; \
+ .text
+
+#define COPY_80(offset) \
+ COPY(0x00 + offset);\
+ COPY(0x20 + offset);\
+ COPY(0x40 + offset);\
+ COPY(0x60 + offset);
+
+/*
+ * int __copy_tofrom_user(char *to, char *from, int len)
+ * Return:
+ * 0 on success
+ * number of not copied bytes on error
+ */
+ .text
+.globl __copy_tofrom_user;
+.type __copy_tofrom_user, @function
+.align 4;
+__copy_tofrom_user:
+ /*
+ * r5 - to
+ * r6 - from
+ * r7, r3 - count
+ * r4 - tempval
+ */
+ beqid r7, 0f /* zero size is not likely */
+ or r3, r5, r6 /* check if either to or from is unaligned */
+ or r3, r3, r7 /* find if count is unaligned */
+ andi r3, r3, 0x3 /* mask the low 2 bits */
+ bneid r3, bu1 /* if r3 is not zero then byte copying */
+ or r3, r0, r0
+
+ rsubi r3, r7, PAGE_SIZE /* detect PAGE_SIZE */
+ beqid r3, page;
+ or r3, r0, r0
+
+w1: lw r4, r6, r3 /* at least one 4 byte copy */
+w2: sw r4, r5, r3
+ addik r7, r7, -4
+ bneid r7, w1
+ addik r3, r3, 4
+ addik r3, r7, 0
+ rtsd r15, 8
+ nop
+
+ .section __ex_table,"a"
+ .word w1, 0f;
+ .word w2, 0f;
+ .text
+
+.align 4 /* Alignment is important to keep icache happy */
+page: /* Create room on stack and save registers for storing values */
+ addik r1, r1, -40
+ swi r5, r1, 0
+ swi r6, r1, 4
+ swi r7, r1, 8
+ swi r19, r1, 12
+ swi r20, r1, 16
+ swi r21, r1, 20
+ swi r22, r1, 24
+ swi r23, r1, 28
+ swi r24, r1, 32
+ swi r25, r1, 36
+loop: /* r4, r19, r20, r21, r22, r23, r24, r25 are used for storing values */
+ /* Loop unrolling to get performance boost */
+ COPY_80(0x000);
+ COPY_80(0x080);
+ COPY_80(0x100);
+ COPY_80(0x180);
+ /* copy loop */
+ addik r6, r6, 0x200
+ addik r7, r7, -0x200
+ bneid r7, loop
+ addik r5, r5, 0x200
+
+ /* Restore register content */
+ lwi r5, r1, 0
+ lwi r6, r1, 4
+ lwi r7, r1, 8
+ lwi r19, r1, 12
+ lwi r20, r1, 16
+ lwi r21, r1, 20
+ lwi r22, r1, 24
+ lwi r23, r1, 28
+ lwi r24, r1, 32
+ lwi r25, r1, 36
+ addik r1, r1, 40
+ /* return back */
+ addik r3, r0, 0
+ rtsd r15, 8
+ nop
+
+/* Fault case - return temp count */
+33:
+ addik r3, r7, 0
+ /* Restore register content */
+ lwi r5, r1, 0
+ lwi r6, r1, 4
+ lwi r7, r1, 8
+ lwi r19, r1, 12
+ lwi r20, r1, 16
+ lwi r21, r1, 20
+ lwi r22, r1, 24
+ lwi r23, r1, 28
+ lwi r24, r1, 32
+ lwi r25, r1, 36
+ addik r1, r1, 40
+ /* return back */
+ rtsd r15, 8
+ nop
+
+.align 4 /* Alignment is important to keep icache happy */
+bu1: lbu r4,r6,r3
+bu2: sb r4,r5,r3
+ addik r7,r7,-1
+ bneid r7,bu1
+ addik r3,r3,1 /* delay slot */
+0:
+ addik r3,r7,0
+ rtsd r15,8
+ nop
+ .size __copy_tofrom_user, . - __copy_tofrom_user
+
+ .section __ex_table,"a"
+ .word bu1, 0b;
+ .word bu2, 0b;
+ .text
diff --git a/arch/microblaze/lib/ucmpdi2.c b/arch/microblaze/lib/ucmpdi2.c
new file mode 100644
index 000000000..c60e068ff
--- /dev/null
+++ b/arch/microblaze/lib/ucmpdi2.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+
+#include "libgcc.h"
+
+word_type __ucmpdi2(unsigned long long a, unsigned long long b)
+{
+ const DWunion au = {.ll = a};
+ const DWunion bu = {.ll = b};
+
+ if ((unsigned int) au.s.high < (unsigned int) bu.s.high)
+ return 0;
+ else if ((unsigned int) au.s.high > (unsigned int) bu.s.high)
+ return 2;
+ if ((unsigned int) au.s.low < (unsigned int) bu.s.low)
+ return 0;
+ else if ((unsigned int) au.s.low > (unsigned int) bu.s.low)
+ return 2;
+ return 1;
+}
+EXPORT_SYMBOL(__ucmpdi2);
diff --git a/arch/microblaze/lib/udivsi3.S b/arch/microblaze/lib/udivsi3.S
new file mode 100644
index 000000000..acdc66723
--- /dev/null
+++ b/arch/microblaze/lib/udivsi3.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+
+/*
+* Unsigned divide operation.
+* Input : Dividend in Reg r5
+* Divisor in Reg r6
+* Output: Result in Reg r3
+*/
+
+ .text
+ .globl __udivsi3
+ .type __udivsi3, @function
+ .ent __udivsi3
+
+__udivsi3:
+
+ .frame r1, 0, r15
+
+ addik r1, r1, -12
+ swi r29, r1, 0
+ swi r30, r1, 4
+ swi r31, r1, 8
+
+ beqi r6, div_by_zero /* div_by_zero - division error */
+ beqid r5, result_is_zero /* result is zero */
+ addik r30, r0, 0 /* clear mod */
+ addik r29, r0, 32 /* initialize the loop count */
+
+/* check if r6 and r5 are equal - if yes, return 1 */
+ rsub r18, r5, r6
+ beqid r18, return_here
+ addik r3, r0, 1
+
+/* check if (uns)r6 is greater than (uns)r5. in that case, just return 0 */
+ xor r18, r5, r6
+ bgeid r18, 16
+ add r3, r0, r0 /* we would anyways clear r3 */
+ blti r6, return_here /* r6[bit 31 = 1] hence is greater */
+ bri checkr6
+ rsub r18, r6, r5 /* microblazecmp */
+ blti r18, return_here
+
+/* if r6 [bit 31] is set, then return result as 1 */
+checkr6:
+ bgti r6, div0
+ brid return_here
+ addik r3, r0, 1
+
+/* first part try to find the first '1' in the r5 */
+div0:
+ blti r5, div2
+div1:
+ add r5, r5, r5 /* left shift logical r5 */
+ bgtid r5, div1
+ addik r29, r29, -1
+div2:
+/* left shift logical r5 get the '1' into the carry */
+ add r5, r5, r5
+ addc r30, r30, r30 /* move that bit into the mod register */
+ rsub r31, r6, r30 /* try to subtract (r30 - r6) */
+ blti r31, mod_too_small
+/* move the r31 to mod since the result was positive */
+ or r30, r0, r31
+ addik r3, r3, 1
+mod_too_small:
+ addik r29, r29, -1
+ beqi r29, loop_end
+ add r3, r3, r3 /* shift in the '1' into div */
+ bri div2 /* div2 */
+loop_end:
+ bri return_here
+div_by_zero:
+result_is_zero:
+ or r3, r0, r0 /* set result to 0 */
+return_here:
+/* restore values of csrs and that of r3 and the divisor and the dividend */
+ lwi r29, r1, 0
+ lwi r30, r1, 4
+ lwi r31, r1, 8
+ rtsd r15, 8
+ addik r1, r1, 12
+
+.size __udivsi3, . - __udivsi3
+.end __udivsi3
diff --git a/arch/microblaze/lib/umodsi3.S b/arch/microblaze/lib/umodsi3.S
new file mode 100644
index 000000000..f4b814e88
--- /dev/null
+++ b/arch/microblaze/lib/umodsi3.S
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+
+/*
+ * Unsigned modulo operation for 32 bit integers.
+ * Input : op1 in Reg r5
+ * op2 in Reg r6
+ * Output: op1 mod op2 in Reg r3
+ */
+
+ .text
+ .globl __umodsi3
+ .type __umodsi3, @function
+ .ent __umodsi3
+
+__umodsi3:
+ .frame r1, 0, r15
+
+ addik r1, r1, -12
+ swi r29, r1, 0
+ swi r30, r1, 4
+ swi r31, r1, 8
+
+ beqi r6, div_by_zero /* div_by_zero - division error */
+ beqid r5, result_is_zero /* result is zero */
+ addik r3, r0, 0 /* clear div */
+ addik r30, r0, 0 /* clear mod */
+ addik r29, r0, 32 /* initialize the loop count */
+
+/* check if r6 and r5 are equal - if yes, return 0 */
+ rsub r18, r5, r6
+ beqi r18, return_here
+
+/* check if (uns)r6 is greater than (uns)r5. in that case, just return r5 */
+ xor r18, r5, r6
+ bgeid r18, 16
+ addik r3, r5, 0
+ blti r6, return_here
+ bri $lcheckr6
+ rsub r18, r5, r6 /* microblazecmp */
+ bgti r18, return_here
+
+/* if r6 [bit 31] is set, then return result as r5-r6 */
+$lcheckr6:
+ bgtid r6, div0
+ addik r3, r0, 0
+ addik r18, r0, 0x7fffffff
+ and r5, r5, r18
+ and r6, r6, r18
+ brid return_here
+ rsub r3, r6, r5
+/* first part: try to find the first '1' in the r5 */
+div0:
+ blti r5, div2
+div1:
+ add r5, r5, r5 /* left shift logical r5 */
+ bgeid r5, div1
+ addik r29, r29, -1
+div2:
+ /* left shift logical r5 get the '1' into the carry */
+ add r5, r5, r5
+ addc r3, r3, r3 /* move that bit into the mod register */
+ rsub r31, r6, r3 /* try to subtract (r3 - r6) */
+ blti r31, mod_too_small
+ /* move the r31 to mod since the result was positive */
+ or r3, r0, r31
+ addik r30, r30, 1
+mod_too_small:
+ addik r29, r29, -1
+ beqi r29, loop_end
+ add r30, r30, r30 /* shift in the '1' into div */
+ bri div2 /* div2 */
+loop_end:
+ bri return_here
+div_by_zero:
+result_is_zero:
+ or r3, r0, r0 /* set result to 0 */
+return_here:
+/* restore values of csrs and that of r3 and the divisor and the dividend */
+ lwi r29, r1, 0
+ lwi r30, r1, 4
+ lwi r31, r1, 8
+ rtsd r15, 8
+ addik r1, r1, 12
+
+.size __umodsi3, . - __umodsi3
+.end __umodsi3