diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /arch/powerpc/lib | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/powerpc/lib')
41 files changed, 16855 insertions, 0 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile new file mode 100644 index 0000000000..6eac63e79a --- /dev/null +++ b/arch/powerpc/lib/Makefile @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for ppc-specific library files.. +# + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +CFLAGS_code-patching.o += -fno-stack-protector +CFLAGS_feature-fixups.o += -fno-stack-protector + +CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) + +KASAN_SANITIZE_code-patching.o := n +KASAN_SANITIZE_feature-fixups.o := n +# restart_table.o contains functions called in the NMI interrupt path +# which can be in real mode. Disable KASAN. +KASAN_SANITIZE_restart_table.o := n +KCSAN_SANITIZE_code-patching.o := n +KCSAN_SANITIZE_feature-fixups.o := n + +ifdef CONFIG_KASAN +CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING +endif + +CFLAGS_code-patching.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_feature-fixups.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + +obj-y += code-patching.o feature-fixups.o pmem.o + +obj-$(CONFIG_CODE_PATCHING_SELFTEST) += test-code-patching.o + +ifndef CONFIG_KASAN +obj-y += string.o memcmp_$(BITS).o +obj-$(CONFIG_PPC32) += strlen_32.o +endif + +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o + +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +# See corresponding test in arch/powerpc/Makefile +# 64-bit linker creates .sfpr on demand for final link (vmlinux), +# so it is only needed for modules, and only for older linkers which +# do not support --save-restore-funcs +ifndef CONFIG_LD_IS_BFD +always-$(CONFIG_PPC64) += crtsavres.o +endif + +obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ + memcpy_power7.o restart_table.o + +obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ + memcpy_64.o copy_mc_64.o + +ifdef CONFIG_PPC_QUEUED_SPINLOCKS +obj-$(CONFIG_SMP) += qspinlock.o +else +obj64-$(CONFIG_SMP) += locks.o +endif + +obj64-$(CONFIG_ALTIVEC) += vmx-helper.o +obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ + test_emulate_step_exec_instr.o + +obj-y += checksum_$(BITS).o checksum_wrappers.o \ + string_$(BITS).o + +obj-y += sstep.o +obj-$(CONFIG_PPC_FPU) += ldstfp.o +obj64-y += quad.o + +obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o + +obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o + +obj-$(CONFIG_ALTIVEC) += xor_vmx.o xor_vmx_glue.o +CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec) +# Enable <altivec.h> +CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include) + +obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S new file mode 100644 index 0000000000..cd00b9bdd7 --- /dev/null +++ b/arch/powerpc/lib/checksum_32.S @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/export.h> +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + + .text + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * __csum_partial(buff, len, sum) + */ +_GLOBAL(__csum_partial) + subi r3,r3,4 + srawi. r6,r4,2 /* Divide len by 4 and also clear carry */ + beq 3f /* if we're doing < 4 bytes */ + andi. r0,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r0,4(r3) /* do 2 bytes to get aligned */ + subi r4,r4,2 + addi r3,r3,2 + srwi. r6,r4,2 /* # words to do */ + adde r5,r5,r0 + beq 3f +1: andi. r6,r6,3 /* Prepare to handle words 4 by 4 */ + beq 21f + mtctr r6 +2: lwzu r0,4(r3) + adde r5,r5,r0 + bdnz 2b +21: srwi. r6,r4,4 /* # blocks of 4 words to do */ + beq 3f + lwz r0,4(r3) + mtctr r6 + lwz r6,8(r3) + adde r5,r5,r0 + lwz r7,12(r3) + adde r5,r5,r6 + lwzu r8,16(r3) + adde r5,r5,r7 + bdz 23f +22: lwz r0,4(r3) + adde r5,r5,r8 + lwz r6,8(r3) + adde r5,r5,r0 + lwz r7,12(r3) + adde r5,r5,r6 + lwzu r8,16(r3) + adde r5,r5,r7 + bdnz 22b +23: adde r5,r5,r8 +3: andi. r0,r4,2 + beq+ 4f + lhz r0,4(r3) + addi r3,r3,2 + adde r5,r5,r0 +4: andi. r0,r4,1 + beq+ 5f + lbz r0,4(r3) + slwi r0,r0,8 /* Upper byte of word */ + adde r5,r5,r0 +5: addze r3,r5 /* add in final carry */ + blr +EXPORT_SYMBOL(__csum_partial) + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in 0xffffffff, while copying the block to dst. + * If an access exception occurs it returns zero. + * + * csum_partial_copy_generic(src, dst, len) + */ +#define CSUM_COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ + adde r12,r12,r7; \ +8 ## n ## 5: \ + stw r8,8(r6); \ + adde r12,r12,r8; \ +8 ## n ## 6: \ + stw r9,12(r6); \ + adde r12,r12,r9; \ +8 ## n ## 7: \ + stwu r10,16(r6); \ + adde r12,r12,r10 + +#define CSUM_COPY_16_BYTES_EXCODE(n) \ + EX_TABLE(8 ## n ## 0b, fault); \ + EX_TABLE(8 ## n ## 1b, fault); \ + EX_TABLE(8 ## n ## 2b, fault); \ + EX_TABLE(8 ## n ## 3b, fault); \ + EX_TABLE(8 ## n ## 4b, fault); \ + EX_TABLE(8 ## n ## 5b, fault); \ + EX_TABLE(8 ## n ## 6b, fault); \ + EX_TABLE(8 ## n ## 7b, fault); + + .text + +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + +_GLOBAL(csum_partial_copy_generic) + li r12,-1 + addic r0,r0,0 /* clear carry */ + addi r6,r4,-4 + neg r0,r4 + addi r4,r3,-4 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + crset 4*cr7+eq + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + rlwinm r7,r6,3,0x8 + rlwnm r12,r12,r7,0,31 /* odd destination address: rotate one byte */ + cmplwi cr7,r7,0 /* is destination address even ? */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f + li r3,0 +70: lbz r9,4(r4) /* do some bytes */ + addi r4,r4,1 + slwi r3,r3,8 + rlwimi r3,r9,0,24,31 +71: stb r9,4(r6) + addi r6,r6,1 + bdnz 70b + adde r12,r12,r3 +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + adde r12,r12,r9 +73: stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 +/* the main body of the cacheline loop */ + CSUM_COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_BYTES >= 32 + CSUM_COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_BYTES >= 64 + CSUM_COPY_16_BYTES_WITHEX(2) + CSUM_COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_BYTES >= 128 + CSUM_COPY_16_BYTES_WITHEX(4) + CSUM_COPY_16_BYTES_WITHEX(5) + CSUM_COPY_16_BYTES_WITHEX(6) + CSUM_COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + adde r12,r12,r0 +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,2 + beq+ 65f +40: lhz r0,4(r4) + addi r4,r4,2 +41: sth r0,4(r6) + adde r12,r12,r0 + addi r6,r6,2 +65: andi. r0,r5,1 + beq+ 66f +50: lbz r0,4(r4) +51: stb r0,4(r6) + slwi r0,r0,8 + adde r12,r12,r0 +66: addze r3,r12 + beqlr+ cr7 + rlwinm r3,r3,8,0,31 /* odd destination address: rotate one byte */ + blr + +fault: + li r3,0 + blr + + EX_TABLE(70b, fault); + EX_TABLE(71b, fault); + EX_TABLE(72b, fault); + EX_TABLE(73b, fault); + EX_TABLE(54b, fault); + +/* + * this stuff handles faults in the cacheline loop and branches to either + * fault (if in read part) or fault (if in write part) + */ + CSUM_COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_BYTES >= 32 + CSUM_COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_BYTES >= 64 + CSUM_COPY_16_BYTES_EXCODE(2) + CSUM_COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_BYTES >= 128 + CSUM_COPY_16_BYTES_EXCODE(4) + CSUM_COPY_16_BYTES_EXCODE(5) + CSUM_COPY_16_BYTES_EXCODE(6) + CSUM_COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + + EX_TABLE(30b, fault); + EX_TABLE(31b, fault); + EX_TABLE(40b, fault); + EX_TABLE(41b, fault); + EX_TABLE(50b, fault); + EX_TABLE(51b, fault); + +EXPORT_SYMBOL(csum_partial_copy_generic) + +/* + * __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + * const struct in6_addr *daddr, + * __u32 len, __u8 proto, __wsum sum) + */ + +_GLOBAL(csum_ipv6_magic) + lwz r8, 0(r3) + lwz r9, 4(r3) + addc r0, r7, r8 + lwz r10, 8(r3) + adde r0, r0, r9 + lwz r11, 12(r3) + adde r0, r0, r10 + lwz r8, 0(r4) + adde r0, r0, r11 + lwz r9, 4(r4) + adde r0, r0, r8 + lwz r10, 8(r4) + adde r0, r0, r9 + lwz r11, 12(r4) + adde r0, r0, r10 + add r5, r5, r6 /* assumption: len + proto doesn't carry */ + adde r0, r0, r11 + adde r0, r0, r5 + addze r0, r0 + rotlwi r3, r0, 16 + add r3, r0, r3 + not r3, r3 + rlwinm r3, r3, 16, 16, 31 + blr +EXPORT_SYMBOL(csum_ipv6_magic) diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S new file mode 100644 index 0000000000..d53d8f09a2 --- /dev/null +++ b/arch/powerpc/lib/checksum_64.S @@ -0,0 +1,443 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/export.h> +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +/* + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). + * + * __csum_partial(r3=buff, r4=len, r5=sum) + */ +_GLOBAL(__csum_partial) + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back adde instructions take 2 cycles + * because of the XER dependency. This means the fastest this loop can + * go is 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 + subi r4,r4,4 + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) +#ifdef __BIG_ENDIAN__ + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +#else + adde r0,r0,r6 +#endif + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr +EXPORT_SYMBOL(__csum_partial) + + + .macro srcnr +100: + EX_TABLE(100b,.Lerror_nr) + .endm + + .macro source +150: + EX_TABLE(150b,.Lerror) + .endm + + .macro dstnr +200: + EX_TABLE(200b,.Lerror_nr) + .endm + + .macro dest +250: + EX_TABLE(250b,.Lerror) + .endm + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in 0xffffffff (32-bit), while copying the block to dst. + * If an access exception occurs, it returns 0. + * + * csum_partial_copy_generic(r3=src, r4=dst, r5=len) + */ +_GLOBAL(csum_partial_copy_generic) + li r6,-1 + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */ + beq .Lcopy_aligned + + li r9,4 + sub r6,r9,r6 + mtctr r6 + +1: +srcnr; lhz r6,0(r3) /* align to doubleword */ + subi r5,r5,2 + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back adde instructions take 2 cycles + * because of the XER dependency. This means the fastest this loop can + * go is 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word + + mtctr r6 +3: +srcnr; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dstnr; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +srcnr; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dstnr; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +srcnr; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. r6,r5,1 + beq .Lcopy_finish + +srcnr; lbz r6,0(r3) +#ifdef __BIG_ENDIAN__ + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +#else + adde r0,r0,r6 +#endif +dstnr; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lerror: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Lerror_nr: + li r3,0 + blr + +EXPORT_SYMBOL(csum_partial_copy_generic) + +/* + * __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + * const struct in6_addr *daddr, + * __u32 len, __u8 proto, __wsum sum) + */ + +_GLOBAL(csum_ipv6_magic) + ld r8, 0(r3) + ld r9, 8(r3) + add r5, r5, r6 + addc r0, r8, r9 + ld r10, 0(r4) + ld r11, 8(r4) +#ifdef CONFIG_CPU_LITTLE_ENDIAN + rotldi r5, r5, 8 +#endif + adde r0, r0, r10 + add r5, r5, r7 + adde r0, r0, r11 + adde r0, r0, r5 + addze r0, r0 + rotldi r3, r0, 32 /* fold two 32 bit halves together */ + add r3, r0, r3 + srdi r0, r3, 32 + rotlwi r3, r0, 16 /* fold two 16 bit halves together */ + add r3, r0, r3 + not r3, r3 + rlwinm r3, r3, 16, 16, 31 + blr +EXPORT_SYMBOL(csum_ipv6_magic) diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c new file mode 100644 index 0000000000..1a14c87802 --- /dev/null +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/export.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/checksum.h> +#include <linux/uaccess.h> + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len) +{ + __wsum csum; + + if (unlikely(!user_read_access_begin(src, len))) + return 0; + + csum = csum_partial_copy_generic((void __force *)src, dst, len); + + user_read_access_end(); + return csum; +} + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) +{ + __wsum csum; + + if (unlikely(!user_write_access_begin(dst, len))) + return 0; + + csum = csum_partial_copy_generic(src, (void __force *)dst, len); + + user_write_access_end(); + return csum; +} diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c new file mode 100644 index 0000000000..b00112d7ad --- /dev/null +++ b/arch/powerpc/lib/code-patching.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + */ + +#include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/random.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/cpuhotplug.h> +#include <linux/uaccess.h> +#include <linux/jump_label.h> + +#include <asm/debug.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/page.h> +#include <asm/code-patching.h> +#include <asm/inst.h> + +static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr) +{ + if (!ppc_inst_prefixed(instr)) { + u32 val = ppc_inst_val(instr); + + __put_kernel_nofault(patch_addr, &val, u32, failed); + } else { + u64 val = ppc_inst_as_ulong(instr); + + __put_kernel_nofault(patch_addr, &val, u64, failed); + } + + asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr), + "r" (exec_addr)); + + return 0; + +failed: + return -EPERM; +} + +int raw_patch_instruction(u32 *addr, ppc_inst_t instr) +{ + return __patch_instruction(addr, instr, addr); +} + +struct patch_context { + union { + struct vm_struct *area; + struct mm_struct *mm; + }; + unsigned long addr; + pte_t *pte; +}; + +static DEFINE_PER_CPU(struct patch_context, cpu_patching_context); + +static int map_patch_area(void *addr, unsigned long text_poke_addr); +static void unmap_patch_area(unsigned long addr); + +static bool mm_patch_enabled(void) +{ + return IS_ENABLED(CONFIG_SMP) && radix_enabled(); +} + +/* + * The following applies for Radix MMU. Hash MMU has different requirements, + * and so is not supported. + * + * Changing mm requires context synchronising instructions on both sides of + * the context switch, as well as a hwsync between the last instruction for + * which the address of an associated storage access was translated using + * the current context. + * + * switch_mm_irqs_off() performs an isync after the context switch. It is + * the responsibility of the caller to perform the CSI and hwsync before + * starting/stopping the temp mm. + */ +static struct mm_struct *start_using_temp_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *orig_mm = current->active_mm; + + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(orig_mm, temp_mm, current); + + WARN_ON(!mm_is_thread_local(temp_mm)); + + suspend_breakpoints(); + return orig_mm; +} + +static void stop_using_temp_mm(struct mm_struct *temp_mm, + struct mm_struct *orig_mm) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(temp_mm, orig_mm, current); + restore_breakpoints(); +} + +static int text_area_cpu_up(unsigned int cpu) +{ + struct vm_struct *area; + unsigned long addr; + int err; + + area = get_vm_area(PAGE_SIZE, VM_ALLOC); + if (!area) { + WARN_ONCE(1, "Failed to create text area for cpu %d\n", + cpu); + return -1; + } + + // Map/unmap the area to ensure all page tables are pre-allocated + addr = (unsigned long)area->addr; + err = map_patch_area(empty_zero_page, addr); + if (err) + return err; + + unmap_patch_area(addr); + + this_cpu_write(cpu_patching_context.area, area); + this_cpu_write(cpu_patching_context.addr, addr); + this_cpu_write(cpu_patching_context.pte, virt_to_kpte(addr)); + + return 0; +} + +static int text_area_cpu_down(unsigned int cpu) +{ + free_vm_area(this_cpu_read(cpu_patching_context.area)); + this_cpu_write(cpu_patching_context.area, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + this_cpu_write(cpu_patching_context.pte, NULL); + return 0; +} + +static void put_patching_mm(struct mm_struct *mm, unsigned long patching_addr) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm); + free_pgd_range(&tlb, patching_addr, patching_addr + PAGE_SIZE, 0, 0); + mmput(mm); +} + +static int text_area_cpu_up_mm(unsigned int cpu) +{ + struct mm_struct *mm; + unsigned long addr; + pte_t *pte; + spinlock_t *ptl; + + mm = mm_alloc(); + if (WARN_ON(!mm)) + goto fail_no_mm; + + /* + * Choose a random page-aligned address from the interval + * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE]. + * The lower address bound is PAGE_SIZE to avoid the zero-page. + */ + addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT; + + /* + * PTE allocation uses GFP_KERNEL which means we need to + * pre-allocate the PTE here because we cannot do the + * allocation during patching when IRQs are disabled. + * + * Using get_locked_pte() to avoid open coding, the lock + * is unnecessary. + */ + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto fail_no_pte; + pte_unmap_unlock(pte, ptl); + + this_cpu_write(cpu_patching_context.mm, mm); + this_cpu_write(cpu_patching_context.addr, addr); + + return 0; + +fail_no_pte: + put_patching_mm(mm, addr); +fail_no_mm: + return -ENOMEM; +} + +static int text_area_cpu_down_mm(unsigned int cpu) +{ + put_patching_mm(this_cpu_read(cpu_patching_context.mm), + this_cpu_read(cpu_patching_context.addr)); + + this_cpu_write(cpu_patching_context.mm, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + + return 0; +} + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done); + +void __init poking_init(void) +{ + int ret; + + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + return; + + if (mm_patch_enabled()) + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke_mm:online", + text_area_cpu_up_mm, + text_area_cpu_down_mm); + else + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", + text_area_cpu_up, + text_area_cpu_down); + + /* cpuhp_setup_state returns >= 0 on success */ + if (WARN_ON(ret < 0)) + return; + + static_branch_enable(&poking_init_done); +} + +static unsigned long get_patch_pfn(void *addr) +{ + if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr)) + return vmalloc_to_pfn(addr); + else + return __pa_symbol(addr) >> PAGE_SHIFT; +} + +/* + * This can be called for kernel text or a module. + */ +static int map_patch_area(void *addr, unsigned long text_poke_addr) +{ + unsigned long pfn = get_patch_pfn(addr); + + return map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL); +} + +static void unmap_patch_area(unsigned long addr) +{ + pte_t *ptep; + pmd_t *pmdp; + pud_t *pudp; + p4d_t *p4dp; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgdp))) + return; + + p4dp = p4d_offset(pgdp, addr); + if (WARN_ON(p4d_none(*p4dp))) + return; + + pudp = pud_offset(p4dp, addr); + if (WARN_ON(pud_none(*pudp))) + return; + + pmdp = pmd_offset(pudp, addr); + if (WARN_ON(pmd_none(*pmdp))) + return; + + ptep = pte_offset_kernel(pmdp, addr); + if (WARN_ON(pte_none(*ptep))) + return; + + /* + * In hash, pte_clear flushes the tlb, in radix, we have to + */ + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} + +static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) +{ + int err; + u32 *patch_addr; + unsigned long text_poke_addr; + pte_t *pte; + unsigned long pfn = get_patch_pfn(addr); + struct mm_struct *patching_mm; + struct mm_struct *orig_mm; + spinlock_t *ptl; + + patching_mm = __this_cpu_read(cpu_patching_context.mm); + text_poke_addr = __this_cpu_read(cpu_patching_context.addr); + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); + if (!pte) + return -ENOMEM; + + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + + /* order PTE update before use, also serves as the hwsync */ + asm volatile("ptesync": : :"memory"); + + /* order context switch after arbitrary prior code */ + isync(); + + orig_mm = start_using_temp_mm(patching_mm); + + err = __patch_instruction(addr, instr, patch_addr); + + /* hwsync performed by __patch_instruction (sync) if successful */ + if (err) + mb(); /* sync */ + + /* context synchronisation performed by __patch_instruction (isync or exception) */ + stop_using_temp_mm(patching_mm, orig_mm); + + pte_clear(patching_mm, text_poke_addr, pte); + /* + * ptesync to order PTE update before TLB invalidation done + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) + */ + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + + pte_unmap_unlock(pte, ptl); + + return err; +} + +static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) +{ + int err; + u32 *patch_addr; + unsigned long text_poke_addr; + pte_t *pte; + unsigned long pfn = get_patch_pfn(addr); + + text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + pte = __this_cpu_read(cpu_patching_context.pte); + __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + /* See ptesync comment in radix__set_pte_at() */ + if (radix_enabled()) + asm volatile("ptesync": : :"memory"); + + err = __patch_instruction(addr, instr, patch_addr); + + pte_clear(&init_mm, text_poke_addr, pte); + flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); + + return err; +} + +int patch_instruction(u32 *addr, ppc_inst_t instr) +{ + int err; + unsigned long flags; + + /* + * During early early boot patch_instruction is called + * when text_poke_area is not ready, but we still need + * to allow patching. We just do the plain old patching + */ + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) || + !static_branch_likely(&poking_init_done)) + return raw_patch_instruction(addr, instr); + + local_irq_save(flags); + if (mm_patch_enabled()) + err = __do_patch_instruction_mm(addr, instr); + else + err = __do_patch_instruction(addr, instr); + local_irq_restore(flags); + + return err; +} +NOKPROBE_SYMBOL(patch_instruction); + +int patch_branch(u32 *addr, unsigned long target, int flags) +{ + ppc_inst_t instr; + + if (create_branch(&instr, addr, target, flags)) + return -ERANGE; + + return patch_instruction(addr, instr); +} + +/* + * Helper to check if a given instruction is a conditional branch + * Derived from the conditional checks in analyse_instr() + */ +bool is_conditional_branch(ppc_inst_t instr) +{ + unsigned int opcode = ppc_inst_primary_opcode(instr); + + if (opcode == 16) /* bc, bca, bcl, bcla */ + return true; + if (opcode == 19) { + switch ((ppc_inst_val(instr) >> 1) & 0x3ff) { + case 16: /* bclr, bclrl */ + case 528: /* bcctr, bcctrl */ + case 560: /* bctar, bctarl */ + return true; + } + } + return false; +} +NOKPROBE_SYMBOL(is_conditional_branch); + +int create_cond_branch(ppc_inst_t *instr, const u32 *addr, + unsigned long target, int flags) +{ + long offset; + + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (!is_offset_in_cond_branch_range(offset)) + return 1; + + /* Mask out the flags and target, so they don't step on each other. */ + *instr = ppc_inst(0x40000000 | (flags & 0x3FF0003) | (offset & 0xFFFC)); + + return 0; +} + +int instr_is_relative_branch(ppc_inst_t instr) +{ + if (ppc_inst_val(instr) & BRANCH_ABSOLUTE) + return 0; + + return instr_is_branch_iform(instr) || instr_is_branch_bform(instr); +} + +int instr_is_relative_link_branch(ppc_inst_t instr) +{ + return instr_is_relative_branch(instr) && (ppc_inst_val(instr) & BRANCH_SET_LINK); +} + +static unsigned long branch_iform_target(const u32 *instr) +{ + signed long imm; + + imm = ppc_inst_val(ppc_inst_read(instr)) & 0x3FFFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x2000000) + imm -= 0x4000000; + + if ((ppc_inst_val(ppc_inst_read(instr)) & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +static unsigned long branch_bform_target(const u32 *instr) +{ + signed long imm; + + imm = ppc_inst_val(ppc_inst_read(instr)) & 0xFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x8000) + imm -= 0x10000; + + if ((ppc_inst_val(ppc_inst_read(instr)) & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +unsigned long branch_target(const u32 *instr) +{ + if (instr_is_branch_iform(ppc_inst_read(instr))) + return branch_iform_target(instr); + else if (instr_is_branch_bform(ppc_inst_read(instr))) + return branch_bform_target(instr); + + return 0; +} + +int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src) +{ + unsigned long target; + target = branch_target(src); + + if (instr_is_branch_iform(ppc_inst_read(src))) + return create_branch(instr, dest, target, + ppc_inst_val(ppc_inst_read(src))); + else if (instr_is_branch_bform(ppc_inst_read(src))) + return create_cond_branch(instr, dest, target, + ppc_inst_val(ppc_inst_read(src))); + + return 1; +} diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S new file mode 100644 index 0000000000..933b685e7a --- /dev/null +++ b/arch/powerpc/lib/copy_32.S @@ -0,0 +1,515 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Memory copy functions for 32-bit PowerPC. + * + * Copyright (C) 1996-2005 Paul Mackerras. + */ +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> +#include <asm/kasan.h> + +#define COPY_16_BYTES \ + lwz r7,4(r4); \ + lwz r8,8(r4); \ + lwz r9,12(r4); \ + lwzu r10,16(r4); \ + stw r7,4(r6); \ + stw r8,8(r6); \ + stw r9,12(r6); \ + stwu r10,16(r6) + +#define COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ +8 ## n ## 5: \ + stw r8,8(r6); \ +8 ## n ## 6: \ + stw r9,12(r6); \ +8 ## n ## 7: \ + stwu r10,16(r6) + +#define COPY_16_BYTES_EXCODE(n) \ +9 ## n ## 0: \ + addi r5,r5,-(16 * n); \ + b 104f; \ +9 ## n ## 1: \ + addi r5,r5,-(16 * n); \ + b 105f; \ + EX_TABLE(8 ## n ## 0b,9 ## n ## 0b); \ + EX_TABLE(8 ## n ## 1b,9 ## n ## 0b); \ + EX_TABLE(8 ## n ## 2b,9 ## n ## 0b); \ + EX_TABLE(8 ## n ## 3b,9 ## n ## 0b); \ + EX_TABLE(8 ## n ## 4b,9 ## n ## 1b); \ + EX_TABLE(8 ## n ## 5b,9 ## n ## 1b); \ + EX_TABLE(8 ## n ## 6b,9 ## n ## 1b); \ + EX_TABLE(8 ## n ## 7b,9 ## n ## 1b) + + .text + +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + +#ifndef CONFIG_KASAN +_GLOBAL(memset16) + rlwinm. r0 ,r5, 31, 1, 31 + addi r6, r3, -4 + beq- 2f + rlwimi r4 ,r4 ,16 ,0 ,15 + mtctr r0 +1: stwu r4, 4(r6) + bdnz 1b +2: andi. r0, r5, 1 + beqlr + sth r4, 4(r6) + blr +EXPORT_SYMBOL(memset16) +#endif + +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. -- paulus + * + * During early init, cache might not be active yet, so dcbz cannot be used. + * We therefore skip the optimised bloc that uses dcbz. This jump is + * replaced by a nop once cache is active. This is done in machine_init() + */ +_GLOBAL_KASAN(memset) + cmplwi 0,r5,4 + blt 7f + + rlwimi r4,r4,8,16,23 + rlwimi r4,r4,16,0,15 + + stw r4,0(r3) + beqlr + andi. r0,r3,3 + add r5,r0,r5 + subf r6,r0,r3 + cmplwi 0,r4,0 + /* + * Skip optimised bloc until cache is enabled. Will be replaced + * by 'bne' during boot to use normal procedure if r4 is not zero + */ +5: b 2f + patch_site 5b, patch__memset_nocache + + clrlwi r7,r6,32-LG_CACHELINE_BYTES + add r8,r7,r5 + srwi r9,r8,LG_CACHELINE_BYTES + addic. r9,r9,-1 /* total number of complete cachelines */ + ble 2f + xori r0,r7,CACHELINE_MASK & ~3 + srwi. r0,r0,2 + beq 3f + mtctr r0 +4: stwu r4,4(r6) + bdnz 4b +3: mtctr r9 + li r7,4 +10: dcbz r7,r6 + addi r6,r6,CACHELINE_BYTES + bdnz 10b + clrlwi r5,r8,32-LG_CACHELINE_BYTES + addi r5,r5,4 + +2: srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r3,-1 +9: stbu r4,1(r6) + bdnz 9b + blr +EXPORT_SYMBOL(memset) +EXPORT_SYMBOL_KASAN(memset) + +/* + * This version uses dcbz on the complete cache lines in the + * destination area to reduce memory traffic. This requires that + * the destination area is cacheable. + * We only use this version if the source and dest don't overlap. + * -- paulus. + * + * During early init, cache might not be active yet, so dcbz cannot be used. + * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is + * replaced by a nop once cache is active. This is done in machine_init() + */ +_GLOBAL_KASAN(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + /* fall through */ + +_GLOBAL_KASAN(memcpy) +1: b generic_memcpy + patch_site 1b, patch__memcpy_nocache + + add r7,r3,r5 /* test if the src & dst overlap */ + add r8,r4,r5 + cmplw 0,r4,r7 + cmplw 1,r3,r8 + crand 0,0,4 /* cr0.lt &= cr1.lt */ + blt generic_memcpy /* if regions overlap */ + + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + subf r5,r0,r5 + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ + addi r4,r4,1 + addi r6,r6,1 + stb r9,3(r6) + bdnz 70b +61: srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + mtctr r0 + beq 63f +53: + dcbz r11,r6 + COPY_16_BYTES +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES + COPY_16_BYTES +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES +#endif +#endif +#endif + bdnz 53b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f + addi r4,r4,3 + addi r6,r6,3 +40: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 40b +65: blr +EXPORT_SYMBOL(memcpy) +EXPORT_SYMBOL(memmove) +EXPORT_SYMBOL_KASAN(memcpy) +EXPORT_SYMBOL_KASAN(memmove) + +generic_memcpy: + srwi. r7,r5,3 + addi r6,r3,-4 + addi r4,r4,-4 + beq 2f /* if less than 8 bytes to do */ + andi. r0,r6,3 /* get dest word aligned */ + mtctr r7 + bne 5f +1: lwz r7,4(r4) + lwzu r8,8(r4) + stw r7,4(r6) + stwu r8,8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,4(r4) + addi r5,r5,-4 + stwu r0,4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r4,r4,3 + addi r6,r6,3 +4: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 4b + blr +5: subfic r0,r0,4 + mtctr r0 +6: lbz r7,4(r4) + addi r4,r4,1 + stb r7,4(r6) + addi r6,r6,1 + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(__copy_tofrom_user) + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ +71: stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ +73: stwu r9,4(r6) + bdnz 72b + + EX_TABLE(70b,100f) + EX_TABLE(71b,101f) + EX_TABLE(72b,102f) + EX_TABLE(73b,103f) + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 + EX_TABLE(54b,105f) +/* the main body of the cacheline loop */ + COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES_WITHEX(2) + COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES_WITHEX(4) + COPY_16_BYTES_WITHEX(5) + COPY_16_BYTES_WITHEX(6) + COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) +41: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: li r3,0 + blr + +/* read fault, initial single-byte copy */ +100: li r9,0 + b 90f +/* write fault, initial single-byte copy */ +101: li r9,1 +90: subf r5,r8,r5 + li r3,0 + b 99f +/* read fault, initial word copy */ +102: li r9,0 + b 91f +/* write fault, initial word copy */ +103: li r9,1 +91: li r3,2 + b 99f + +/* + * this stuff handles faults in the cacheline loop and branches to either + * 104f (if in read part) or 105f (if in write part), after updating r5 + */ + COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES_EXCODE(2) + COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES_EXCODE(4) + COPY_16_BYTES_EXCODE(5) + COPY_16_BYTES_EXCODE(6) + COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + +/* read fault in cacheline loop */ +104: li r9,0 + b 92f +/* fault on dcbz (effectively a write fault) */ +/* or write fault in cacheline loop */ +105: li r9,1 +92: li r3,LG_CACHELINE_BYTES + mfctr r8 + add r0,r0,r8 + b 106f +/* read fault in final word loop */ +108: li r9,0 + b 93f +/* write fault in final word loop */ +109: li r9,1 +93: andi. r5,r5,3 + li r3,2 + b 99f +/* read fault in final byte loop */ +110: li r9,0 + b 94f +/* write fault in final byte loop */ +111: li r9,1 +94: li r5,0 + li r3,0 +/* + * At this stage the number of bytes not copied is + * r5 + (ctr << r3), and r9 is 0 for read or 1 for write. + */ +99: mfctr r0 +106: slw r3,r0,r3 + add. r3,r3,r5 + beq 120f /* shouldn't happen */ + cmpwi 0,r9,0 + bne 120f +/* for a read fault, first try to continue the copy one byte at a time */ + mtctr r3 +130: lbz r0,4(r4) +131: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 130b +/* then clear out the destination: r3 bytes starting at 4(r6) */ +132: mfctr r3 +120: blr + + EX_TABLE(30b,108b) + EX_TABLE(31b,109b) + EX_TABLE(40b,110b) + EX_TABLE(41b,111b) + EX_TABLE(130b,132b) + EX_TABLE(131b,120b) + +EXPORT_SYMBOL(__copy_tofrom_user) diff --git a/arch/powerpc/lib/copy_mc_64.S b/arch/powerpc/lib/copy_mc_64.S new file mode 100644 index 0000000000..bf1014b28f --- /dev/null +++ b/arch/powerpc/lib/copy_mc_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) IBM Corporation, 2011 + * Derived from copyuser_power7.s by Anton Blanchard <anton@au.ibm.com> + * Author - Balbir Singh <bsingharora@gmail.com> + */ +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/errno.h> + + .macro err1 +100: + EX_TABLE(100b,.Ldo_err1) + .endm + + .macro err2 +200: + EX_TABLE(200b,.Ldo_err2) + .endm + + .macro err3 +300: EX_TABLE(300b,.Ldone) + .endm + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + /* Do a byte by byte copy to get the exact remaining size */ + mtctr r7 +46: +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + bdnz 46b + li r3,0 + blr + +.Ldone: + mfctr r3 + blr + + +_GLOBAL(copy_mc_generic) + mr r7,r5 + cmpldi r5,16 + blt .Lshort_copy + +.Lcopy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + subi r7,r7,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + subi r7,r7,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + subi r7,r7,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + blt 5f + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r8,16(r4) +err2; ld r9,24(r4) +err2; ld r10,32(r4) +err2; ld r11,40(r4) +err2; ld r12,48(r4) +err2; ld r14,56(r4) +err2; ld r15,64(r4) +err2; ld r16,72(r4) +err2; ld r17,80(r4) +err2; ld r18,88(r4) +err2; ld r19,96(r4) +err2; ld r20,104(r4) +err2; ld r21,112(r4) +err2; ld r22,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) +err2; std r15,64(r3) +err2; std r16,72(r3) +err2; std r17,80(r3) +err2; std r18,88(r3) +err2; std r19,96(r3) +err2; std r20,104(r3) +err2; std r21,112(r3) +err2; std r22,120(r3) + addi r3,r3,128 + subi r7,r7,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r8,16(r4) +err2; ld r9,24(r4) +err2; ld r10,32(r4) +err2; ld r11,40(r4) +err2; ld r12,48(r4) +err2; ld r14,56(r4) + addi r4,r4,64 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) + addi r3,r3,64 + subi r7,r7,64 + +7: ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 63B to go */ + bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r8,16(r4) +err1; ld r9,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r8,16(r3) +err1; std r9,24(r3) + addi r3,r3,32 + subi r7,r7,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + subi r7,r7,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + subi r7,r7,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + subi r7,r7,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + subi r7,r7,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +EXPORT_SYMBOL_GPL(copy_mc_generic); diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S new file mode 100644 index 0000000000..f33a2e6088 --- /dev/null +++ b/arch/powerpc/lib/copypage_64.S @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2008 Mark Nelson, IBM Corp. + */ +#include <linux/export.h> +#include <asm/page.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/feature-fixups.h> + +_GLOBAL_TOC(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE +#ifdef CONFIG_PPC_BOOK3S_64 + b copypage_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l +#ifdef CONFIG_PPC_KERNEL_PCREL + /* + * Hack for toolchain - prefixed instructions cause label difference to + * be non-constant even if 8 byte alignment is known, so they can not + * be put in FTR sections. + */ + LOAD_REG_ADDR(r10, ppc64_caches) +BEGIN_FTR_SECTION +#else +BEGIN_FTR_SECTION + LOAD_REG_ADDR(r10, ppc64_caches) +#endif + lwz r11,DCACHEL1LOGBLOCKSIZE(r10) /* log2 of cache block size */ + lwz r12,DCACHEL1BLOCKSIZE(r10) /* get cache block size */ + li r9,0 + srd r8,r5,r11 + + mtctr r8 +.Lsetup: + dcbt r9,r4 + dcbz r9,r3 + add r9,r9,r12 + bdnz .Lsetup +END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ) + addi r3,r3,-8 + srdi r8,r5,7 /* page is copied in 128 byte strides */ + addi r8,r8,-1 /* one stride copied outside loop */ + + mtctr r8 + + ld r5,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ldu r8,24(r4) +1: std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + ld r5,104(r4) + ld r6,112(r4) + std r11,120(r3) + stdu r12,128(r3) + ld r7,120(r4) + ldu r8,128(r4) + bdnz 1b + + std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + std r11,120(r3) + std r12,128(r3) + blr +EXPORT_SYMBOL(copy_page) diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S new file mode 100644 index 0000000000..a783973f12 --- /dev/null +++ b/arch/powerpc/lib/copypage_power7.S @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 => to */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7 + * units/cachelines=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units/cachelines=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + + /* setup read stream 0 */ + dcbt 0,r4,0b01000 /* addr from */ + dcbt 0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst 0,r9,0b01000 /* addr to */ + dcbtst 0,r10,0b01010 /* length and depth to */ + eieio + dcbt 0,r8,0b01010 /* all streams GO */ + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl CFUNC(enter_vmx_ops) + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx v7,0,r4 + lvx v6,r4,r6 + lvx v5,r4,r7 + lvx v4,r4,r8 + lvx v3,r4,r9 + lvx v2,r4,r10 + lvx v1,r4,r11 + lvx v0,r4,r12 + addi r4,r4,128 + stvx v7,0,r3 + stvx v6,r3,r6 + stvx v5,r3,r7 + stvx v4,r3,r8 + stvx v3,r3,r9 + stvx v2,r3,r10 + stvx v1,r3,r11 + stvx v0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b CFUNC(exit_vmx_ops) /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S new file mode 100644 index 0000000000..9af969d2cc --- /dev/null +++ b/arch/powerpc/lib/copyuser_64.S @@ -0,0 +1,564 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + */ +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-compat.h> +#include <asm/feature-fixups.h> + +#ifndef SELFTEST_CASE +/* 0 == most CPUs, 1 == POWER6, 2 == Cell */ +#define SELFTEST_CASE 0 +#endif + +#ifdef __BIG_ENDIAN__ +#define sLd sld /* Shift towards low-numbered address. */ +#define sHd srd /* Shift towards high-numbered address. */ +#else +#define sLd srd /* Shift towards low-numbered address. */ +#define sHd sld /* Shift towards high-numbered address. */ +#endif + +/* + * These macros are used to generate exception table entries. + * The exception handlers below use the original arguments + * (stored on the stack) and the point where we're up to in + * the destination buffer, i.e. the address of the first + * unmodified byte. Generally r3 points into the destination + * buffer, but the first unmodified byte is at a variable + * offset from r3. In the code below, the symbol r3_offset + * is set to indicate the current offset at each point in + * the code. This offset is then used as a negative offset + * from the exception handler code, and those instructions + * before the exception handlers are addi instructions that + * adjust r3 to point to the correct place. + */ + .macro lex /* exception handler for load */ +100: EX_TABLE(100b, .Lld_exc - r3_offset) + .endm + + .macro stex /* exception handler for store */ +100: EX_TABLE(100b, .Lst_exc - r3_offset) + .endm + + .align 7 +_GLOBAL_TOC(__copy_tofrom_user) +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#endif +_GLOBAL(__copy_tofrom_user_base) + /* first check for a 4kB copy on a 4kB boundary */ + cmpldi cr1,r5,16 + cmpdi cr6,r5,4096 + or r0,r3,r4 + neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + andi. r0,r0,4095 + std r3,-24(r1) + crand cr0*4+2,cr0*4+2,cr6*4+2 + std r4,-16(r1) + std r5,-8(r1) + dcbt 0,r4 + beq .Lcopy_page_4K + andi. r6,r6,7 + PPC_MTOCRF(0x01,r5) + blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + * cleared. + * At the time of writing the only CPU that has this combination of bits + * set is Power6. + */ +test_feature = (SELFTEST_CASE == 1) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) +.Ldst_aligned: + addi r3,r3,-16 +r3_offset = 16 +test_feature = (SELFTEST_CASE == 0) +BEGIN_FTR_SECTION + andi. r0,r4,7 + bne .Lsrc_unaligned +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +lex; ld r7,0(r4) +lex; ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 +r3_offset = 0 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: +lex; ld r7,16(r4) +lex; ld r6,24(r4) + addi r4,r4,32 +stex; std r9,0(r3) +r3_offset = 8 +stex; std r8,8(r3) +r3_offset = 16 +22: +lex; ld r9,0(r4) +lex; ld r8,8(r4) +stex; std r7,16(r3) +r3_offset = 24 +stex; std r6,24(r3) + addi r3,r3,32 +r3_offset = 0 + bdnz 21b +72: +stex; std r9,0(r3) +r3_offset = 8 +stex; std r8,8(r3) +r3_offset = 16 + andi. r5,r5,0xf + beq+ 3f + addi r4,r4,16 +.Ldo_tail: + addi r3,r3,16 +r3_offset = 0 + bf cr7*4+0,246f +lex; ld r9,0(r4) + addi r4,r4,8 +stex; std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +lex; lwz r9,0(r4) + addi r4,r4,4 +stex; stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f +lex; lhz r9,0(r4) + addi r4,r4,2 +stex; sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f +lex; lbz r9,0(r4) +stex; stb r9,0(r3) +3: li r3,0 + blr + +.Lsrc_unaligned: +r3_offset = 16 + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpldi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + bt cr7*4+0,28f + +lex; ld r9,0(r4) /* 3+2n loads, 2+2n stores */ +lex; ld r0,8(r4) + sLd r6,r9,r10 +lex; ldu r9,16(r4) + sHd r7,r0,r11 + sLd r8,r0,r10 + or r7,r7,r6 + blt cr6,79f +lex; ld r0,8(r4) + b 2f + +28: +lex; ld r0,0(r4) /* 4+2n loads, 3+2n stores */ +lex; ldu r9,8(r4) + sLd r8,r0,r10 + addi r3,r3,-8 +r3_offset = 24 + blt cr6,5f +lex; ld r0,8(r4) + sHd r12,r9,r11 + sLd r6,r9,r10 +lex; ldu r9,16(r4) + or r12,r8,r12 + sHd r7,r0,r11 + sLd r8,r0,r10 + addi r3,r3,16 +r3_offset = 8 + beq cr6,78f + +1: or r7,r7,r6 +lex; ld r0,8(r4) +stex; std r12,8(r3) +r3_offset = 16 +2: sHd r12,r9,r11 + sLd r6,r9,r10 +lex; ldu r9,16(r4) + or r12,r8,r12 +stex; stdu r7,16(r3) +r3_offset = 8 + sHd r7,r0,r11 + sLd r8,r0,r10 + bdnz 1b + +78: +stex; std r12,8(r3) +r3_offset = 16 + or r7,r7,r6 +79: +stex; std r7,16(r3) +r3_offset = 24 +5: sHd r12,r9,r11 + or r12,r8,r12 +stex; std r12,24(r3) +r3_offset = 32 + bne 6f + li r3,0 + blr +6: cmpwi cr1,r5,8 + addi r3,r3,32 +r3_offset = 0 + sLd r9,r9,r10 + ble cr1,7f +lex; ld r0,8(r4) + sHd r7,r0,r11 + or r9,r7,r9 +7: + bf cr7*4+1,1f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,32 +#endif +stex; stw r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,32 +#endif + addi r3,r3,4 +1: bf cr7*4+2,2f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,16 +#endif +stex; sth r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,16 +#endif + addi r3,r3,2 +2: bf cr7*4+3,3f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,8 +#endif +stex; stb r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,8 +#endif +3: li r3,0 + blr + +.Ldst_unaligned: +r3_offset = 0 + PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */ + subf r5,r6,r5 + li r7,0 + cmpldi cr1,r5,16 + bf cr7*4+3,1f +100: EX_TABLE(100b, .Lld_exc_r7) + lbz r0,0(r4) +100: EX_TABLE(100b, .Lst_exc_r7) + stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f +100: EX_TABLE(100b, .Lld_exc_r7) + lhzx r0,r7,r4 +100: EX_TABLE(100b, .Lst_exc_r7) + sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f +100: EX_TABLE(100b, .Lld_exc_r7) + lwzx r0,r7,r4 +100: EX_TABLE(100b, .Lst_exc_r7) + stwx r0,r7,r3 +3: PPC_MTOCRF(0x01,r5) + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: +r3_offset = 0 + bf cr7*4+0,1f +lex; lwz r0,0(r4) +lex; lwz r9,4(r4) + addi r4,r4,8 +stex; stw r0,0(r3) +stex; stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f +lex; lwz r0,0(r4) + addi r4,r4,4 +stex; stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f +lex; lhz r0,0(r4) + addi r4,r4,2 +stex; sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f +lex; lbz r0,0(r4) +stex; stb r0,0(r3) +4: li r3,0 + blr + +/* + * exception handlers follow + * we have to return the number of bytes not copied + * for an exception on a load, we set the rest of the destination to 0 + * Note that the number of bytes of instructions for adjusting r3 needs + * to equal the amount of the adjustment, due to the trick of using + * .Lld_exc - r3_offset as the handler address. + */ + +.Lld_exc_r7: + add r3,r3,r7 + b .Lld_exc + + /* adjust by 24 */ + addi r3,r3,8 + nop + /* adjust by 16 */ + addi r3,r3,8 + nop + /* adjust by 8 */ + addi r3,r3,8 + nop + +/* + * Here we have had a fault on a load and r3 points to the first + * unmodified byte of the destination. We use the original arguments + * and r3 to work out how much wasn't copied. Since we load some + * distance ahead of the stores, we continue copying byte-by-byte until + * we hit the load fault again in order to copy as much as possible. + */ +.Lld_exc: + ld r6,-24(r1) + ld r4,-16(r1) + ld r5,-8(r1) + subf r6,r6,r3 + add r4,r4,r6 + subf r5,r6,r5 /* #bytes left to go */ + +/* + * first see if we can copy any more bytes before hitting another exception + */ + mtctr r5 +r3_offset = 0 +100: EX_TABLE(100b, .Ldone) +43: lbz r0,0(r4) + addi r4,r4,1 +stex; stb r0,0(r3) + addi r3,r3,1 + bdnz 43b + li r3,0 /* huh? all copied successfully this time? */ + blr + +/* + * here we have trapped again, amount remaining is in ctr. + */ +.Ldone: + mfctr r3 + blr + +/* + * exception handlers for stores: we need to work out how many bytes + * weren't copied, and we may need to copy some more. + * Note that the number of bytes of instructions for adjusting r3 needs + * to equal the amount of the adjustment, due to the trick of using + * .Lst_exc - r3_offset as the handler address. + */ +.Lst_exc_r7: + add r3,r3,r7 + b .Lst_exc + + /* adjust by 24 */ + addi r3,r3,8 + nop + /* adjust by 16 */ + addi r3,r3,8 + nop + /* adjust by 8 */ + addi r3,r3,4 + /* adjust by 4 */ + addi r3,r3,4 +.Lst_exc: + ld r6,-24(r1) /* original destination pointer */ + ld r4,-16(r1) /* original source pointer */ + ld r5,-8(r1) /* original number of bytes */ + add r7,r6,r5 + /* + * If the destination pointer isn't 8-byte aligned, + * we may have got the exception as a result of a + * store that overlapped a page boundary, so we may be + * able to copy a few more bytes. + */ +17: andi. r0,r3,7 + beq 19f + subf r8,r6,r3 /* #bytes copied */ +100: EX_TABLE(100b,19f) + lbzx r0,r8,r4 +100: EX_TABLE(100b,19f) + stb r0,0(r3) + addi r3,r3,1 + cmpld r3,r7 + blt 17b +19: subf r3,r3,r7 /* #bytes not copied in r3 */ + blr + +/* + * Routine to copy a whole page of data, optimized for POWER4. + * On POWER4 it is more than 50% faster than the simple loop + * above (following the .Ldst_aligned label). + */ + .macro exc +100: EX_TABLE(100b, .Labort) + .endm +.Lcopy_page_4K: + std r31,-32(1) + std r30,-40(1) + std r29,-48(1) + std r28,-56(1) + std r27,-64(1) + std r26,-72(1) + std r25,-80(1) + std r24,-88(1) + std r23,-96(1) + std r22,-104(1) + std r21,-112(1) + std r20,-120(1) + li r5,4096/32 - 1 + addi r3,r3,-8 + li r0,5 +0: addi r5,r5,-24 + mtctr r0 +exc; ld r22,640(4) +exc; ld r21,512(4) +exc; ld r20,384(4) +exc; ld r11,256(4) +exc; ld r9,128(4) +exc; ld r7,0(4) +exc; ld r25,648(4) +exc; ld r24,520(4) +exc; ld r23,392(4) +exc; ld r10,264(4) +exc; ld r8,136(4) +exc; ldu r6,8(4) + cmpwi r5,24 +1: +exc; std r22,648(3) +exc; std r21,520(3) +exc; std r20,392(3) +exc; std r11,264(3) +exc; std r9,136(3) +exc; std r7,8(3) +exc; ld r28,648(4) +exc; ld r27,520(4) +exc; ld r26,392(4) +exc; ld r31,264(4) +exc; ld r30,136(4) +exc; ld r29,8(4) +exc; std r25,656(3) +exc; std r24,528(3) +exc; std r23,400(3) +exc; std r10,272(3) +exc; std r8,144(3) +exc; std r6,16(3) +exc; ld r22,656(4) +exc; ld r21,528(4) +exc; ld r20,400(4) +exc; ld r11,272(4) +exc; ld r9,144(4) +exc; ld r7,16(4) +exc; std r28,664(3) +exc; std r27,536(3) +exc; std r26,408(3) +exc; std r31,280(3) +exc; std r30,152(3) +exc; stdu r29,24(3) +exc; ld r25,664(4) +exc; ld r24,536(4) +exc; ld r23,408(4) +exc; ld r10,280(4) +exc; ld r8,152(4) +exc; ldu r6,24(4) + bdnz 1b +exc; std r22,648(3) +exc; std r21,520(3) +exc; std r20,392(3) +exc; std r11,264(3) +exc; std r9,136(3) +exc; std r7,8(3) + addi r4,r4,640 + addi r3,r3,648 + bge 0b + mtctr r5 +exc; ld r7,0(4) +exc; ld r8,8(4) +exc; ldu r9,16(4) +3: +exc; ld r10,8(4) +exc; std r7,8(3) +exc; ld r7,16(4) +exc; std r8,16(3) +exc; ld r8,24(4) +exc; std r9,24(3) +exc; ldu r9,32(4) +exc; stdu r10,32(3) + bdnz 3b +4: +exc; ld r10,8(4) +exc; std r7,8(3) +exc; std r8,16(3) +exc; std r9,24(3) +exc; std r10,32(3) +9: ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + li r3,0 + blr + +/* + * on an exception, reset to the beginning and jump back into the + * standard __copy_tofrom_user + */ +.Labort: + ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + ld r3,-24(r1) + ld r4,-16(r1) + li r5,4096 + b .Ldst_aligned +EXPORT_SYMBOL(__copy_tofrom_user) diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S new file mode 100644 index 0000000000..ac41053c3a --- /dev/null +++ b/arch/powerpc/lib/copyuser_power7.S @@ -0,0 +1,695 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifndef SELFTEST_CASE +/* 0 == don't use VMX, 1 == use VMX */ +#define SELFTEST_CASE 0 +#endif + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + + .macro err1 +100: + EX_TABLE(100b,.Ldo_err1) + .endm + + .macro err2 +200: + EX_TABLE(200b,.Ldo_err2) + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + EX_TABLE(300b,.Ldo_err3) + .endm + + .macro err4 +400: + EX_TABLE(400b,.Ldo_err4) + .endm + + +.Ldo_err4: + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Ldo_err3: + bl CFUNC(exit_vmx_usercopy) + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) + cmpldi r5,16 + cmpldi cr1,r5,3328 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy + +#ifdef CONFIG_ALTIVEC +test_feature = SELFTEST_CASE +BEGIN_FTR_SECTION + bgt cr1,.Lvmx_copy +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +.Lvmx_copy: +#ifdef CONFIG_ALTIVEC + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl CFUNC(enter_vmx_usercopy) + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + + /* setup read stream 0 */ + dcbt 0,r6,0b01000 /* addr from */ + dcbt 0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst 0,r9,0b01000 /* addr to */ + dcbtst 0,r10,0b01010 /* length and depth to */ + eieio + dcbt 0,r8,0b01010 /* all streams GO */ + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx v1,0,r4 + addi r4,r4,16 +err3; stvx v1,0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx v1,0,r4 +err3; lvx v0,r4,r9 + addi r4,r4,32 +err3; stvx v1,0,r3 +err3; stvx v0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx v3,0,r4 +err3; lvx v2,r4,r9 +err3; lvx v1,r4,r10 +err3; lvx v0,r4,r11 + addi r4,r4,64 +err3; stvx v3,0,r3 +err3; stvx v2,r3,r9 +err3; stvx v1,r3,r10 +err3; stvx v0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx v7,0,r4 +err4; lvx v6,r4,r9 +err4; lvx v5,r4,r10 +err4; lvx v4,r4,r11 +err4; lvx v3,r4,r12 +err4; lvx v2,r4,r14 +err4; lvx v1,r4,r15 +err4; lvx v0,r4,r16 + addi r4,r4,128 +err4; stvx v7,0,r3 +err4; stvx v6,r3,r9 +err4; stvx v5,r3,r10 +err4; stvx v4,r3,r11 +err4; stvx v3,r3,r12 +err4; stvx v2,r3,r14 +err4; stvx v1,r3,r15 +err4; stvx v0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx v3,0,r4 +err3; lvx v2,r4,r9 +err3; lvx v1,r4,r10 +err3; lvx v0,r4,r11 + addi r4,r4,64 +err3; stvx v3,0,r3 +err3; stvx v2,r3,r9 +err3; stvx v1,r3,r10 +err3; stvx v0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx v1,0,r4 +err3; lvx v0,r4,r9 + addi r4,r4,32 +err3; stvx v1,0,r3 +err3; stvx v0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx v1,0,r4 + addi r4,r4,16 +err3; stvx v1,0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b CFUNC(exit_vmx_usercopy) /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(v16,0,r4) /* Setup permute control vector */ +err3; lvx v0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 +err3; stvx v8,0,r3 + addi r3,r3,16 + vor v0,v1,v1 + +5: bf cr7*4+2,6f +err3; lvx v1,0,r4 + VPERM(v8,v0,v1,v16) +err3; lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 +err3; stvx v8,0,r3 +err3; stvx v9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx v3,0,r4 + VPERM(v8,v0,v3,v16) +err3; lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) +err3; lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) +err3; lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 +err3; stvx v8,0,r3 +err3; stvx v9,r3,r9 +err3; stvx v10,r3,r10 +err3; stvx v11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx v7,0,r4 + VPERM(v8,v0,v7,v16) +err4; lvx v6,r4,r9 + VPERM(v9,v7,v6,v16) +err4; lvx v5,r4,r10 + VPERM(v10,v6,v5,v16) +err4; lvx v4,r4,r11 + VPERM(v11,v5,v4,v16) +err4; lvx v3,r4,r12 + VPERM(v12,v4,v3,v16) +err4; lvx v2,r4,r14 + VPERM(v13,v3,v2,v16) +err4; lvx v1,r4,r15 + VPERM(v14,v2,v1,v16) +err4; lvx v0,r4,r16 + VPERM(v15,v1,v0,v16) + addi r4,r4,128 +err4; stvx v8,0,r3 +err4; stvx v9,r3,r9 +err4; stvx v10,r3,r10 +err4; stvx v11,r3,r11 +err4; stvx v12,r3,r12 +err4; stvx v13,r3,r14 +err4; stvx v14,r3,r15 +err4; stvx v15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx v3,0,r4 + VPERM(v8,v0,v3,v16) +err3; lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) +err3; lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) +err3; lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 +err3; stvx v8,0,r3 +err3; stvx v9,r3,r9 +err3; stvx v10,r3,r10 +err3; stvx v11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx v1,0,r4 + VPERM(v8,v0,v1,v16) +err3; lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 +err3; stvx v8,0,r3 +err3; stvx v9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 +err3; stvx v8,0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b CFUNC(exit_vmx_usercopy) /* tail call optimise */ +#endif /* CONFIG_ALTIVEC */ diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S new file mode 100644 index 0000000000..7e5e1c28e5 --- /dev/null +++ b/arch/powerpc/lib/crtsavres.S @@ -0,0 +1,545 @@ +/* + * Special support for eabi and SVR4 + * + * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc. + * Copyright 2008 Freescale Semiconductor, Inc. + * Written By Michael Meissner + * + * Based on gcc/config/rs6000/crtsavres.asm from gcc + * 64 bit additions from reading the PPC elf64abi document. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * In addition to the permissions in the GNU General Public License, the + * Free Software Foundation gives you unlimited permission to link the + * compiled version of this file with other programs, and to distribute + * those programs without any restriction coming from the use of this + * file. (The General Public License restrictions do apply in other + * respects; for example, they cover modification of the file, and + * distribution when not linked into another program.) + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * As a special exception, if you link this library with files + * compiled with GCC to produce an executable, this does not cause + * the resulting executable to be covered by the GNU General Public License. + * This exception does not however invalidate any other reasons why + * the executable file might be covered by the GNU General Public License. + */ + +#include <asm/ppc_asm.h> + + .file "crtsavres.S" + +#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + + .section ".text" + +#ifndef CONFIG_PPC64 + +/* Routines for saving integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer save area. */ + +_GLOBAL(_savegpr_14) +_GLOBAL(_save32gpr_14) + stw 14,-72(11) /* save gp registers */ +_GLOBAL(_savegpr_15) +_GLOBAL(_save32gpr_15) + stw 15,-68(11) +_GLOBAL(_savegpr_16) +_GLOBAL(_save32gpr_16) + stw 16,-64(11) +_GLOBAL(_savegpr_17) +_GLOBAL(_save32gpr_17) + stw 17,-60(11) +_GLOBAL(_savegpr_18) +_GLOBAL(_save32gpr_18) + stw 18,-56(11) +_GLOBAL(_savegpr_19) +_GLOBAL(_save32gpr_19) + stw 19,-52(11) +_GLOBAL(_savegpr_20) +_GLOBAL(_save32gpr_20) + stw 20,-48(11) +_GLOBAL(_savegpr_21) +_GLOBAL(_save32gpr_21) + stw 21,-44(11) +_GLOBAL(_savegpr_22) +_GLOBAL(_save32gpr_22) + stw 22,-40(11) +_GLOBAL(_savegpr_23) +_GLOBAL(_save32gpr_23) + stw 23,-36(11) +_GLOBAL(_savegpr_24) +_GLOBAL(_save32gpr_24) + stw 24,-32(11) +_GLOBAL(_savegpr_25) +_GLOBAL(_save32gpr_25) + stw 25,-28(11) +_GLOBAL(_savegpr_26) +_GLOBAL(_save32gpr_26) + stw 26,-24(11) +_GLOBAL(_savegpr_27) +_GLOBAL(_save32gpr_27) + stw 27,-20(11) +_GLOBAL(_savegpr_28) +_GLOBAL(_save32gpr_28) + stw 28,-16(11) +_GLOBAL(_savegpr_29) +_GLOBAL(_save32gpr_29) + stw 29,-12(11) +_GLOBAL(_savegpr_30) +_GLOBAL(_save32gpr_30) + stw 30,-8(11) +_GLOBAL(_savegpr_31) +_GLOBAL(_save32gpr_31) + stw 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14) +_GLOBAL(_rest32gpr_14) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15) +_GLOBAL(_rest32gpr_15) + lwz 15,-68(11) +_GLOBAL(_restgpr_16) +_GLOBAL(_rest32gpr_16) + lwz 16,-64(11) +_GLOBAL(_restgpr_17) +_GLOBAL(_rest32gpr_17) + lwz 17,-60(11) +_GLOBAL(_restgpr_18) +_GLOBAL(_rest32gpr_18) + lwz 18,-56(11) +_GLOBAL(_restgpr_19) +_GLOBAL(_rest32gpr_19) + lwz 19,-52(11) +_GLOBAL(_restgpr_20) +_GLOBAL(_rest32gpr_20) + lwz 20,-48(11) +_GLOBAL(_restgpr_21) +_GLOBAL(_rest32gpr_21) + lwz 21,-44(11) +_GLOBAL(_restgpr_22) +_GLOBAL(_rest32gpr_22) + lwz 22,-40(11) +_GLOBAL(_restgpr_23) +_GLOBAL(_rest32gpr_23) + lwz 23,-36(11) +_GLOBAL(_restgpr_24) +_GLOBAL(_rest32gpr_24) + lwz 24,-32(11) +_GLOBAL(_restgpr_25) +_GLOBAL(_rest32gpr_25) + lwz 25,-28(11) +_GLOBAL(_restgpr_26) +_GLOBAL(_rest32gpr_26) + lwz 26,-24(11) +_GLOBAL(_restgpr_27) +_GLOBAL(_rest32gpr_27) + lwz 27,-20(11) +_GLOBAL(_restgpr_28) +_GLOBAL(_rest32gpr_28) + lwz 28,-16(11) +_GLOBAL(_restgpr_29) +_GLOBAL(_rest32gpr_29) + lwz 29,-12(11) +_GLOBAL(_restgpr_30) +_GLOBAL(_rest32gpr_30) + lwz 30,-8(11) +_GLOBAL(_restgpr_31) +_GLOBAL(_rest32gpr_31) + lwz 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14_x) +_GLOBAL(_rest32gpr_14_x) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15_x) +_GLOBAL(_rest32gpr_15_x) + lwz 15,-68(11) +_GLOBAL(_restgpr_16_x) +_GLOBAL(_rest32gpr_16_x) + lwz 16,-64(11) +_GLOBAL(_restgpr_17_x) +_GLOBAL(_rest32gpr_17_x) + lwz 17,-60(11) +_GLOBAL(_restgpr_18_x) +_GLOBAL(_rest32gpr_18_x) + lwz 18,-56(11) +_GLOBAL(_restgpr_19_x) +_GLOBAL(_rest32gpr_19_x) + lwz 19,-52(11) +_GLOBAL(_restgpr_20_x) +_GLOBAL(_rest32gpr_20_x) + lwz 20,-48(11) +_GLOBAL(_restgpr_21_x) +_GLOBAL(_rest32gpr_21_x) + lwz 21,-44(11) +_GLOBAL(_restgpr_22_x) +_GLOBAL(_rest32gpr_22_x) + lwz 22,-40(11) +_GLOBAL(_restgpr_23_x) +_GLOBAL(_rest32gpr_23_x) + lwz 23,-36(11) +_GLOBAL(_restgpr_24_x) +_GLOBAL(_rest32gpr_24_x) + lwz 24,-32(11) +_GLOBAL(_restgpr_25_x) +_GLOBAL(_rest32gpr_25_x) + lwz 25,-28(11) +_GLOBAL(_restgpr_26_x) +_GLOBAL(_rest32gpr_26_x) + lwz 26,-24(11) +_GLOBAL(_restgpr_27_x) +_GLOBAL(_rest32gpr_27_x) + lwz 27,-20(11) +_GLOBAL(_restgpr_28_x) +_GLOBAL(_rest32gpr_28_x) + lwz 28,-16(11) +_GLOBAL(_restgpr_29_x) +_GLOBAL(_rest32gpr_29_x) + lwz 29,-12(11) +_GLOBAL(_restgpr_30_x) +_GLOBAL(_rest32gpr_30_x) + lwz 30,-8(11) +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz 0,4(11) + lwz 31,-4(11) + mtlr 0 + mr 1,11 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +_GLOBAL(_savevr_20) + li r11,-192 + stvx v20,r11,r0 +_GLOBAL(_savevr_21) + li r11,-176 + stvx v21,r11,r0 +_GLOBAL(_savevr_22) + li r11,-160 + stvx v22,r11,r0 +_GLOBAL(_savevr_23) + li r11,-144 + stvx v23,r11,r0 +_GLOBAL(_savevr_24) + li r11,-128 + stvx v24,r11,r0 +_GLOBAL(_savevr_25) + li r11,-112 + stvx v25,r11,r0 +_GLOBAL(_savevr_26) + li r11,-96 + stvx v26,r11,r0 +_GLOBAL(_savevr_27) + li r11,-80 + stvx v27,r11,r0 +_GLOBAL(_savevr_28) + li r11,-64 + stvx v28,r11,r0 +_GLOBAL(_savevr_29) + li r11,-48 + stvx v29,r11,r0 +_GLOBAL(_savevr_30) + li r11,-32 + stvx v30,r11,r0 +_GLOBAL(_savevr_31) + li r11,-16 + stvx v31,r11,r0 + blr + +_GLOBAL(_restvr_20) + li r11,-192 + lvx v20,r11,r0 +_GLOBAL(_restvr_21) + li r11,-176 + lvx v21,r11,r0 +_GLOBAL(_restvr_22) + li r11,-160 + lvx v22,r11,r0 +_GLOBAL(_restvr_23) + li r11,-144 + lvx v23,r11,r0 +_GLOBAL(_restvr_24) + li r11,-128 + lvx v24,r11,r0 +_GLOBAL(_restvr_25) + li r11,-112 + lvx v25,r11,r0 +_GLOBAL(_restvr_26) + li r11,-96 + lvx v26,r11,r0 +_GLOBAL(_restvr_27) + li r11,-80 + lvx v27,r11,r0 +_GLOBAL(_restvr_28) + li r11,-64 + lvx v28,r11,r0 +_GLOBAL(_restvr_29) + li r11,-48 + lvx v29,r11,r0 +_GLOBAL(_restvr_30) + li r11,-32 + lvx v30,r11,r0 +_GLOBAL(_restvr_31) + li r11,-16 + lvx v31,r11,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#else /* CONFIG_PPC64 */ + +.globl _savegpr0_14 +_savegpr0_14: + std r14,-144(r1) +.globl _savegpr0_15 +_savegpr0_15: + std r15,-136(r1) +.globl _savegpr0_16 +_savegpr0_16: + std r16,-128(r1) +.globl _savegpr0_17 +_savegpr0_17: + std r17,-120(r1) +.globl _savegpr0_18 +_savegpr0_18: + std r18,-112(r1) +.globl _savegpr0_19 +_savegpr0_19: + std r19,-104(r1) +.globl _savegpr0_20 +_savegpr0_20: + std r20,-96(r1) +.globl _savegpr0_21 +_savegpr0_21: + std r21,-88(r1) +.globl _savegpr0_22 +_savegpr0_22: + std r22,-80(r1) +.globl _savegpr0_23 +_savegpr0_23: + std r23,-72(r1) +.globl _savegpr0_24 +_savegpr0_24: + std r24,-64(r1) +.globl _savegpr0_25 +_savegpr0_25: + std r25,-56(r1) +.globl _savegpr0_26 +_savegpr0_26: + std r26,-48(r1) +.globl _savegpr0_27 +_savegpr0_27: + std r27,-40(r1) +.globl _savegpr0_28 +_savegpr0_28: + std r28,-32(r1) +.globl _savegpr0_29 +_savegpr0_29: + std r29,-24(r1) +.globl _savegpr0_30 +_savegpr0_30: + std r30,-16(r1) +.globl _savegpr0_31 +_savegpr0_31: + std r31,-8(r1) + std r0,16(r1) + blr + +.globl _restgpr0_14 +_restgpr0_14: + ld r14,-144(r1) +.globl _restgpr0_15 +_restgpr0_15: + ld r15,-136(r1) +.globl _restgpr0_16 +_restgpr0_16: + ld r16,-128(r1) +.globl _restgpr0_17 +_restgpr0_17: + ld r17,-120(r1) +.globl _restgpr0_18 +_restgpr0_18: + ld r18,-112(r1) +.globl _restgpr0_19 +_restgpr0_19: + ld r19,-104(r1) +.globl _restgpr0_20 +_restgpr0_20: + ld r20,-96(r1) +.globl _restgpr0_21 +_restgpr0_21: + ld r21,-88(r1) +.globl _restgpr0_22 +_restgpr0_22: + ld r22,-80(r1) +.globl _restgpr0_23 +_restgpr0_23: + ld r23,-72(r1) +.globl _restgpr0_24 +_restgpr0_24: + ld r24,-64(r1) +.globl _restgpr0_25 +_restgpr0_25: + ld r25,-56(r1) +.globl _restgpr0_26 +_restgpr0_26: + ld r26,-48(r1) +.globl _restgpr0_27 +_restgpr0_27: + ld r27,-40(r1) +.globl _restgpr0_28 +_restgpr0_28: + ld r28,-32(r1) +.globl _restgpr0_29 +_restgpr0_29: + ld r0,16(r1) + ld r29,-24(r1) + mtlr r0 + ld r30,-16(r1) + ld r31,-8(r1) + blr + +.globl _restgpr0_30 +_restgpr0_30: + ld r30,-16(r1) +.globl _restgpr0_31 +_restgpr0_31: + ld r0,16(r1) + ld r31,-8(r1) + mtlr r0 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +.globl _savevr_20 +_savevr_20: + li r12,-192 + stvx v20,r12,r0 +.globl _savevr_21 +_savevr_21: + li r12,-176 + stvx v21,r12,r0 +.globl _savevr_22 +_savevr_22: + li r12,-160 + stvx v22,r12,r0 +.globl _savevr_23 +_savevr_23: + li r12,-144 + stvx v23,r12,r0 +.globl _savevr_24 +_savevr_24: + li r12,-128 + stvx v24,r12,r0 +.globl _savevr_25 +_savevr_25: + li r12,-112 + stvx v25,r12,r0 +.globl _savevr_26 +_savevr_26: + li r12,-96 + stvx v26,r12,r0 +.globl _savevr_27 +_savevr_27: + li r12,-80 + stvx v27,r12,r0 +.globl _savevr_28 +_savevr_28: + li r12,-64 + stvx v28,r12,r0 +.globl _savevr_29 +_savevr_29: + li r12,-48 + stvx v29,r12,r0 +.globl _savevr_30 +_savevr_30: + li r12,-32 + stvx v30,r12,r0 +.globl _savevr_31 +_savevr_31: + li r12,-16 + stvx v31,r12,r0 + blr + +.globl _restvr_20 +_restvr_20: + li r12,-192 + lvx v20,r12,r0 +.globl _restvr_21 +_restvr_21: + li r12,-176 + lvx v21,r12,r0 +.globl _restvr_22 +_restvr_22: + li r12,-160 + lvx v22,r12,r0 +.globl _restvr_23 +_restvr_23: + li r12,-144 + lvx v23,r12,r0 +.globl _restvr_24 +_restvr_24: + li r12,-128 + lvx v24,r12,r0 +.globl _restvr_25 +_restvr_25: + li r12,-112 + lvx v25,r12,r0 +.globl _restvr_26 +_restvr_26: + li r12,-96 + lvx v26,r12,r0 +.globl _restvr_27 +_restvr_27: + li r12,-80 + lvx v27,r12,r0 +.globl _restvr_28 +_restvr_28: + li r12,-64 + lvx v28,r12,r0 +.globl _restvr_29 +_restvr_29: + li r12,-48 + lvx v29,r12,r0 +.globl _restvr_30 +_restvr_30: + li r12,-32 + lvx v30,r12,r0 +.globl _restvr_31 +_restvr_31: + li r12,-16 + lvx v31,r12,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#endif /* CONFIG_PPC64 */ + +#endif diff --git a/arch/powerpc/lib/div64.S b/arch/powerpc/lib/div64.S new file mode 100644 index 0000000000..3d5426e7dc --- /dev/null +++ b/arch/powerpc/lib/div64.S @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Divide a 64-bit unsigned number by a 32-bit unsigned number. + * This routine assumes that the top 32 bits of the dividend are + * non-zero to start with. + * On entry, r3 points to the dividend, which get overwritten with + * the 64-bit quotient, and r4 contains the divisor. + * On exit, r3 contains the remainder. + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + */ +#include <asm/ppc_asm.h> +#include <asm/processor.h> + +_GLOBAL(__div64_32) + lwz r5,0(r3) # get the dividend into r5/r6 + lwz r6,4(r3) + cmplw r5,r4 + li r7,0 + li r8,0 + blt 1f + divwu r7,r5,r4 # if dividend.hi >= divisor, + mullw r0,r7,r4 # quotient.hi = dividend.hi / divisor + subf. r5,r0,r5 # dividend.hi %= divisor + beq 3f +1: mr r11,r5 # here dividend.hi != 0 + andis. r0,r5,0xc000 + bne 2f + cntlzw r0,r5 # we are shifting the dividend right + li r10,-1 # to make it < 2^32, and shifting + srw r10,r10,r0 # the divisor right the same amount, + addc r9,r4,r10 # rounding up (so the estimate cannot + andc r11,r6,r10 # ever be too large, only too small) + andc r9,r9,r10 + addze r9,r9 + or r11,r5,r11 + rotlw r9,r9,r0 + rotlw r11,r11,r0 + divwu r11,r11,r9 # then we divide the shifted quantities +2: mullw r10,r11,r4 # to get an estimate of the quotient, + mulhwu r9,r11,r4 # multiply the estimate by the divisor, + subfc r6,r10,r6 # take the product from the divisor, + add r8,r8,r11 # and add the estimate to the accumulated + subfe. r5,r9,r5 # quotient + bne 1b +3: cmplw r6,r4 + blt 4f + divwu r0,r6,r4 # perform the remaining 32-bit division + mullw r10,r0,r4 # and get the remainder + add r8,r8,r0 + subf r6,r10,r6 +4: stw r7,0(r3) # return the quotient in *r3 + stw r8,4(r3) + mr r3,r6 # return the remainder in r3 + blr diff --git a/arch/powerpc/lib/error-inject.c b/arch/powerpc/lib/error-inject.c new file mode 100644 index 0000000000..e834079d2b --- /dev/null +++ b/arch/powerpc/lib/error-inject.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/error-injection.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'blr'. 'regs' represents the state on entry of a predefined + * function in the kernel/module, captured on a kprobe. We don't need + * to worry about 32-bit userspace on a 64-bit kernel. + */ + regs_set_return_ip(regs, regs->link); +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S new file mode 100644 index 0000000000..480172fbd0 --- /dev/null +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -0,0 +1,862 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + */ + +#include <asm/feature-fixups.h> +#include <asm/ppc_asm.h> +#include <asm/synch.h> +#include <asm/asm-compat.h> +#include <asm/ppc-opcode.h> + + .text + +#define globl(x) \ + .globl x; \ +x: + +globl(ftr_fixup_test1) + or 1,1,1 + or 2,2,2 /* fixup will nop out this instruction */ + or 3,3,3 + +globl(end_ftr_fixup_test1) + +globl(ftr_fixup_test1_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test1_expected) + or 1,1,1 + nop + or 3,3,3 + +globl(ftr_fixup_test2) + or 1,1,1 + or 2,2,2 /* fixup will replace this with ftr_fixup_test2_alt */ + or 3,3,3 + +globl(end_ftr_fixup_test2) + +globl(ftr_fixup_test2_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test2_alt) + or 31,31,31 + +globl(ftr_fixup_test2_expected) + or 1,1,1 + or 31,31,31 + or 3,3,3 + +globl(ftr_fixup_test3) + or 1,1,1 + or 2,2,2 /* fixup will fail to replace this */ + or 3,3,3 + +globl(end_ftr_fixup_test3) + +globl(ftr_fixup_test3_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test3_alt) + or 31,31,31 + or 31,31,31 + +globl(ftr_fixup_test4) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(end_ftr_fixup_test4) + +globl(ftr_fixup_test4_expected) + or 1,1,1 + or 31,31,31 + or 31,31,31 + nop + nop + or 3,3,3 + +globl(ftr_fixup_test4_orig) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test4_alt) + or 31,31,31 + or 31,31,31 + + +globl(ftr_fixup_test5) + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 + +globl(end_ftr_fixup_test5) + +globl(ftr_fixup_test5_expected) + or 1,1,1 +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b + or 1,1,1 + +globl(ftr_fixup_test6) +1: or 1,1,1 +BEGIN_FTR_SECTION + or 5,5,5 +2: PPC_LCMPI r3,0 + beq 4f + blt 2b + b 1b + b 4f +FTR_SECTION_ELSE +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +ALT_FTR_SECTION_END(0, 1) +3: or 1,1,1 + or 2,2,2 +4: or 3,3,3 + +globl(end_ftr_fixup_test6) + +globl(ftr_fixup_test6_expected) +1: or 1,1,1 +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +3: or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test7) + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b + bdnz 3b +1: +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 + or 1,1,1 + +globl(end_ftr_fixup_test7) + nop + +globl(ftr_fixup_test7_expected) + or 1,1,1 +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b + bdnz 3b +1: or 1,1,1 + +#if 0 +/* Test that if we have a larger else case the assembler spots it and + * reports an error. #if 0'ed so as not to break the build normally. + */ +ftr_fixup_test_too_big: + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE + or 3,3,3 + or 3,3,3 + or 3,3,3 + or 3,3,3 +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 +#endif + +#define MAKE_MACRO_TEST(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 2,2,2; \ + or 2,2,2; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; + +#define MAKE_MACRO_TEST_EXPECTED(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros_expected) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ + /* NB. inner section is not nop'ed, but then entire outer is */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + or 3,3,3; \ + or 3,3,3; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + nop; \ + nop; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 3,3,3; */ \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 5,5,5; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + or 5,5,5; \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; + +MAKE_MACRO_TEST(FTR); +MAKE_MACRO_TEST_EXPECTED(FTR); + +#ifdef CONFIG_PPC64 +MAKE_MACRO_TEST(FW_FTR); +MAKE_MACRO_TEST_EXPECTED(FW_FTR); +#endif + +globl(lwsync_fixup_test) +1: or 1,1,1 + LWSYNC +globl(end_lwsync_fixup_test) + +globl(lwsync_fixup_test_expected_LWSYNC) +1: or 1,1,1 + lwsync + +globl(lwsync_fixup_test_expected_SYNC) +1: or 1,1,1 + sync + +globl(ftr_fixup_prefix1) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 +globl(end_ftr_fixup_prefix1) + +globl(ftr_fixup_prefix1_orig) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 + +globl(ftr_fixup_prefix1_expected) + or 1,1,1 + nop + nop + or 2,2,2 + +globl(ftr_fixup_prefix2) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 +globl(end_ftr_fixup_prefix2) + +globl(ftr_fixup_prefix2_orig) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 + +globl(ftr_fixup_prefix2_alt) + .long OP_PREFIX << 26 + .long 0x0000001 + +globl(ftr_fixup_prefix2_expected) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000001 + or 2,2,2 + +globl(ftr_fixup_prefix3) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 + or 3,3,3 +globl(end_ftr_fixup_prefix3) + +globl(ftr_fixup_prefix3_orig) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000000 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_prefix3_alt) + .long OP_PREFIX << 26 + .long 0x0000001 + nop + +globl(ftr_fixup_prefix3_expected) + or 1,1,1 + .long OP_PREFIX << 26 + .long 0x0000001 + nop + or 3,3,3 diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c new file mode 100644 index 0000000000..4f82581ca2 --- /dev/null +++ b/arch/powerpc/lib/feature-fixups.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * Copyright 2008 Michael Ellerman, IBM Corporation. + */ + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/stop_machine.h> +#include <asm/cputable.h> +#include <asm/code-patching.h> +#include <asm/interrupt.h> +#include <asm/page.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/security_features.h> +#include <asm/firmware.h> +#include <asm/inst.h> + +struct fixup_entry { + unsigned long mask; + unsigned long value; + long start_off; + long end_off; + long alt_start_off; + long alt_end_off; +}; + +static u32 *calc_addr(struct fixup_entry *fcur, long offset) +{ + /* + * We store the offset to the code as a negative offset from + * the start of the alt_entry, to support the VDSO. This + * routine converts that back into an actual address. + */ + return (u32 *)((unsigned long)fcur + offset); +} + +static int patch_alt_instruction(u32 *src, u32 *dest, u32 *alt_start, u32 *alt_end) +{ + int err; + ppc_inst_t instr; + + instr = ppc_inst_read(src); + + if (instr_is_relative_branch(ppc_inst_read(src))) { + u32 *target = (u32 *)branch_target(src); + + /* Branch within the section doesn't need translating */ + if (target < alt_start || target > alt_end) { + err = translate_branch(&instr, dest, src); + if (err) + return 1; + } + } + + raw_patch_instruction(dest, instr); + + return 0; +} + +static int patch_feature_section_mask(unsigned long value, unsigned long mask, + struct fixup_entry *fcur) +{ + u32 *start, *end, *alt_start, *alt_end, *src, *dest; + + start = calc_addr(fcur, fcur->start_off); + end = calc_addr(fcur, fcur->end_off); + alt_start = calc_addr(fcur, fcur->alt_start_off); + alt_end = calc_addr(fcur, fcur->alt_end_off); + + if ((alt_end - alt_start) > (end - start)) + return 1; + + if ((value & fcur->mask & mask) == (fcur->value & mask)) + return 0; + + src = alt_start; + dest = start; + + for (; src < alt_end; src = ppc_inst_next(src, src), + dest = ppc_inst_next(dest, dest)) { + if (patch_alt_instruction(src, dest, alt_start, alt_end)) + return 1; + } + + for (; dest < end; dest++) + raw_patch_instruction(dest, ppc_inst(PPC_RAW_NOP())); + + return 0; +} + +static void do_feature_fixups_mask(unsigned long value, unsigned long mask, + void *fixup_start, void *fixup_end) +{ + struct fixup_entry *fcur, *fend; + + fcur = fixup_start; + fend = fixup_end; + + for (; fcur < fend; fcur++) { + if (patch_feature_section_mask(value, mask, fcur)) { + WARN_ON(1); + printk("Unable to patch feature section at %p - %p" \ + " with %p - %p\n", + calc_addr(fcur, fcur->start_off), + calc_addr(fcur, fcur->end_off), + calc_addr(fcur, fcur->alt_start_off), + calc_addr(fcur, fcur->alt_end_off)); + } + } +} + +void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + do_feature_fixups_mask(value, ~0, fixup_start, fixup_end); +} + +#ifdef CONFIG_PPC_BARRIER_NOSPEC +static bool is_fixup_addr_valid(void *dest, size_t size) +{ + return system_state < SYSTEM_FREEING_INITMEM || + !init_section_contains(dest, size); +} + +static int do_patch_fixups(long *start, long *end, unsigned int *instrs, int num) +{ + int i; + + for (i = 0; start < end; start++, i++) { + int j; + unsigned int *dest = (void *)start + *start; + + if (!is_fixup_addr_valid(dest, sizeof(*instrs) * num)) + continue; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + for (j = 0; j < num; j++) + patch_instruction(dest + j, ppc_inst(instrs[j])); + } + return i; +} +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +static int do_patch_entry_fixups(long *start, long *end, unsigned int *instrs, + bool do_fallback, void *fallback) +{ + int i; + + for (i = 0; start < end; start++, i++) { + unsigned int *dest = (void *)start + *start; + + if (!is_fixup_addr_valid(dest, sizeof(*instrs) * 3)) + continue; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + // See comment in do_entry_flush_fixups() RE order of patching + if (do_fallback) { + patch_instruction(dest, ppc_inst(instrs[0])); + patch_instruction(dest + 2, ppc_inst(instrs[2])); + patch_branch(dest + 1, (unsigned long)fallback, BRANCH_SET_LINK); + } else { + patch_instruction(dest + 1, ppc_inst(instrs[1])); + patch_instruction(dest + 2, ppc_inst(instrs[2])); + patch_instruction(dest, ppc_inst(instrs[0])); + } + } + return i; +} + +static void do_stf_entry_barrier_fixups(enum stf_barrier_type types) +{ + unsigned int instrs[3]; + long *start, *end; + int i; + + start = PTRRELOC(&__start___stf_entry_barrier_fixup); + end = PTRRELOC(&__stop___stf_entry_barrier_fixup); + + instrs[0] = PPC_RAW_NOP(); + instrs[1] = PPC_RAW_NOP(); + instrs[2] = PPC_RAW_NOP(); + + i = 0; + if (types & STF_BARRIER_FALLBACK) { + instrs[i++] = PPC_RAW_MFLR(_R10); + instrs[i++] = PPC_RAW_NOP(); /* branch patched below */ + instrs[i++] = PPC_RAW_MTLR(_R10); + } else if (types & STF_BARRIER_EIEIO) { + instrs[i++] = PPC_RAW_EIEIO() | 0x02000000; /* eieio + bit 6 hint */ + } else if (types & STF_BARRIER_SYNC_ORI) { + instrs[i++] = PPC_RAW_SYNC(); + instrs[i++] = PPC_RAW_LD(_R10, _R13, 0); + instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + } + + i = do_patch_entry_fixups(start, end, instrs, types & STF_BARRIER_FALLBACK, + &stf_barrier_fallback); + + printk(KERN_DEBUG "stf-barrier: patched %d entry locations (%s barrier)\n", i, + (types == STF_BARRIER_NONE) ? "no" : + (types == STF_BARRIER_FALLBACK) ? "fallback" : + (types == STF_BARRIER_EIEIO) ? "eieio" : + (types == (STF_BARRIER_SYNC_ORI)) ? "hwsync" + : "unknown"); +} + +static void do_stf_exit_barrier_fixups(enum stf_barrier_type types) +{ + unsigned int instrs[6]; + long *start, *end; + int i; + + start = PTRRELOC(&__start___stf_exit_barrier_fixup); + end = PTRRELOC(&__stop___stf_exit_barrier_fixup); + + instrs[0] = PPC_RAW_NOP(); + instrs[1] = PPC_RAW_NOP(); + instrs[2] = PPC_RAW_NOP(); + instrs[3] = PPC_RAW_NOP(); + instrs[4] = PPC_RAW_NOP(); + instrs[5] = PPC_RAW_NOP(); + + i = 0; + if (types & STF_BARRIER_FALLBACK || types & STF_BARRIER_SYNC_ORI) { + if (cpu_has_feature(CPU_FTR_HVMODE)) { + instrs[i++] = PPC_RAW_MTSPR(SPRN_HSPRG1, _R13); + instrs[i++] = PPC_RAW_MFSPR(_R13, SPRN_HSPRG0); + } else { + instrs[i++] = PPC_RAW_MTSPR(SPRN_SPRG2, _R13); + instrs[i++] = PPC_RAW_MFSPR(_R13, SPRN_SPRG1); + } + instrs[i++] = PPC_RAW_SYNC(); + instrs[i++] = PPC_RAW_LD(_R13, _R13, 0); + instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + if (cpu_has_feature(CPU_FTR_HVMODE)) + instrs[i++] = PPC_RAW_MFSPR(_R13, SPRN_HSPRG1); + else + instrs[i++] = PPC_RAW_MFSPR(_R13, SPRN_SPRG2); + } else if (types & STF_BARRIER_EIEIO) { + instrs[i++] = PPC_RAW_EIEIO() | 0x02000000; /* eieio + bit 6 hint */ + } + + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); + + printk(KERN_DEBUG "stf-barrier: patched %d exit locations (%s barrier)\n", i, + (types == STF_BARRIER_NONE) ? "no" : + (types == STF_BARRIER_FALLBACK) ? "fallback" : + (types == STF_BARRIER_EIEIO) ? "eieio" : + (types == (STF_BARRIER_SYNC_ORI)) ? "hwsync" + : "unknown"); +} + +static bool stf_exit_reentrant = false; +static bool rfi_exit_reentrant = false; +static DEFINE_MUTEX(exit_flush_lock); + +static int __do_stf_barrier_fixups(void *data) +{ + enum stf_barrier_type *types = data; + + do_stf_entry_barrier_fixups(*types); + do_stf_exit_barrier_fixups(*types); + + return 0; +} + +void do_stf_barrier_fixups(enum stf_barrier_type types) +{ + /* + * The call to the fallback entry flush, and the fallback/sync-ori exit + * flush can not be safely patched in/out while other CPUs are + * executing them. So call __do_stf_barrier_fixups() on one CPU while + * all other CPUs spin in the stop machine core with interrupts hard + * disabled. + * + * The branch to mark interrupt exits non-reentrant is enabled first, + * then stop_machine runs which will ensure all CPUs are out of the + * low level interrupt exit code before patching. After the patching, + * if allowed, then flip the branch to allow fast exits. + */ + + // Prevent static key update races with do_rfi_flush_fixups() + mutex_lock(&exit_flush_lock); + static_branch_enable(&interrupt_exit_not_reentrant); + + stop_machine(__do_stf_barrier_fixups, &types, NULL); + + if ((types & STF_BARRIER_FALLBACK) || (types & STF_BARRIER_SYNC_ORI)) + stf_exit_reentrant = false; + else + stf_exit_reentrant = true; + + if (stf_exit_reentrant && rfi_exit_reentrant) + static_branch_disable(&interrupt_exit_not_reentrant); + + mutex_unlock(&exit_flush_lock); +} + +void do_uaccess_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[4]; + long *start, *end; + int i; + + start = PTRRELOC(&__start___uaccess_flush_fixup); + end = PTRRELOC(&__stop___uaccess_flush_fixup); + + instrs[0] = PPC_RAW_NOP(); + instrs[1] = PPC_RAW_NOP(); + instrs[2] = PPC_RAW_NOP(); + instrs[3] = PPC_RAW_BLR(); + + i = 0; + if (types == L1D_FLUSH_FALLBACK) { + instrs[3] = PPC_RAW_NOP(); + /* fallthrough to fallback flush */ + } + + if (types & L1D_FLUSH_ORI) { + instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + instrs[i++] = PPC_RAW_ORI(_R30, _R30, 0); /* L1d flush */ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = PPC_RAW_MTSPR(SPRN_TRIG2, _R0); + + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); + + printk(KERN_DEBUG "uaccess-flush: patched %d locations (%s flush)\n", i, + (types == L1D_FLUSH_NONE) ? "no" : + (types == L1D_FLUSH_FALLBACK) ? "fallback displacement" : + (types & L1D_FLUSH_ORI) ? (types & L1D_FLUSH_MTTRIG) + ? "ori+mttrig type" + : "ori type" : + (types & L1D_FLUSH_MTTRIG) ? "mttrig type" + : "unknown"); +} + +static int __do_entry_flush_fixups(void *data) +{ + enum l1d_flush_type types = *(enum l1d_flush_type *)data; + unsigned int instrs[3]; + long *start, *end; + int i; + + instrs[0] = PPC_RAW_NOP(); + instrs[1] = PPC_RAW_NOP(); + instrs[2] = PPC_RAW_NOP(); + + i = 0; + if (types == L1D_FLUSH_FALLBACK) { + instrs[i++] = PPC_RAW_MFLR(_R10); + instrs[i++] = PPC_RAW_NOP(); /* branch patched below */ + instrs[i++] = PPC_RAW_MTLR(_R10); + } + + if (types & L1D_FLUSH_ORI) { + instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + instrs[i++] = PPC_RAW_ORI(_R30, _R30, 0); /* L1d flush */ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = PPC_RAW_MTSPR(SPRN_TRIG2, _R0); + + /* + * If we're patching in or out the fallback flush we need to be careful about the + * order in which we patch instructions. That's because it's possible we could + * take a page fault after patching one instruction, so the sequence of + * instructions must be safe even in a half patched state. + * + * To make that work, when patching in the fallback flush we patch in this order: + * - the mflr (dest) + * - the mtlr (dest + 2) + * - the branch (dest + 1) + * + * That ensures the sequence is safe to execute at any point. In contrast if we + * patch the mtlr last, it's possible we could return from the branch and not + * restore LR, leading to a crash later. + * + * When patching out the fallback flush (either with nops or another flush type), + * we patch in this order: + * - the branch (dest + 1) + * - the mtlr (dest + 2) + * - the mflr (dest) + * + * Note we are protected by stop_machine() from other CPUs executing the code in a + * semi-patched state. + */ + + start = PTRRELOC(&__start___entry_flush_fixup); + end = PTRRELOC(&__stop___entry_flush_fixup); + i = do_patch_entry_fixups(start, end, instrs, types == L1D_FLUSH_FALLBACK, + &entry_flush_fallback); + + start = PTRRELOC(&__start___scv_entry_flush_fixup); + end = PTRRELOC(&__stop___scv_entry_flush_fixup); + i += do_patch_entry_fixups(start, end, instrs, types == L1D_FLUSH_FALLBACK, + &scv_entry_flush_fallback); + + printk(KERN_DEBUG "entry-flush: patched %d locations (%s flush)\n", i, + (types == L1D_FLUSH_NONE) ? "no" : + (types == L1D_FLUSH_FALLBACK) ? "fallback displacement" : + (types & L1D_FLUSH_ORI) ? (types & L1D_FLUSH_MTTRIG) + ? "ori+mttrig type" + : "ori type" : + (types & L1D_FLUSH_MTTRIG) ? "mttrig type" + : "unknown"); + + return 0; +} + +void do_entry_flush_fixups(enum l1d_flush_type types) +{ + /* + * The call to the fallback flush can not be safely patched in/out while + * other CPUs are executing it. So call __do_entry_flush_fixups() on one + * CPU while all other CPUs spin in the stop machine core with interrupts + * hard disabled. + */ + stop_machine(__do_entry_flush_fixups, &types, NULL); +} + +static int __do_rfi_flush_fixups(void *data) +{ + enum l1d_flush_type types = *(enum l1d_flush_type *)data; + unsigned int instrs[3]; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup); + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = PPC_RAW_NOP(); + instrs[1] = PPC_RAW_NOP(); + instrs[2] = PPC_RAW_NOP(); + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = PPC_RAW_BRANCH(16); + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + instrs[i++] = PPC_RAW_ORI(_R30, _R30, 0); /* L1d flush */ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = PPC_RAW_MTSPR(SPRN_TRIG2, _R0); + + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); + + printk(KERN_DEBUG "rfi-flush: patched %d locations (%s flush)\n", i, + (types == L1D_FLUSH_NONE) ? "no" : + (types == L1D_FLUSH_FALLBACK) ? "fallback displacement" : + (types & L1D_FLUSH_ORI) ? (types & L1D_FLUSH_MTTRIG) + ? "ori+mttrig type" + : "ori type" : + (types & L1D_FLUSH_MTTRIG) ? "mttrig type" + : "unknown"); + + return 0; +} + +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + /* + * stop_machine gets all CPUs out of the interrupt exit handler same + * as do_stf_barrier_fixups. do_rfi_flush_fixups patching can run + * without stop_machine, so this could be achieved with a broadcast + * IPI instead, but this matches the stf sequence. + */ + + // Prevent static key update races with do_stf_barrier_fixups() + mutex_lock(&exit_flush_lock); + static_branch_enable(&interrupt_exit_not_reentrant); + + stop_machine(__do_rfi_flush_fixups, &types, NULL); + + if (types & L1D_FLUSH_FALLBACK) + rfi_exit_reentrant = false; + else + rfi_exit_reentrant = true; + + if (stf_exit_reentrant && rfi_exit_reentrant) + static_branch_disable(&interrupt_exit_not_reentrant); + + mutex_unlock(&exit_flush_lock); +} + +void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) +{ + unsigned int instr; + long *start, *end; + int i; + + start = fixup_start; + end = fixup_end; + + instr = PPC_RAW_NOP(); + + if (enable) { + pr_info("barrier-nospec: using ORI speculation barrier\n"); + instr = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ + } + + i = do_patch_fixups(start, end, &instr, 1); + + printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); +} + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BARRIER_NOSPEC +void do_barrier_nospec_fixups(bool enable) +{ + void *start, *end; + + start = PTRRELOC(&__start___barrier_nospec_fixup); + end = PTRRELOC(&__stop___barrier_nospec_fixup); + + do_barrier_nospec_fixups_range(enable, start, end); +} +#endif /* CONFIG_PPC_BARRIER_NOSPEC */ + +#ifdef CONFIG_PPC_E500 +void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) +{ + unsigned int instr[2]; + long *start, *end; + int i; + + start = fixup_start; + end = fixup_end; + + instr[0] = PPC_RAW_NOP(); + instr[1] = PPC_RAW_NOP(); + + if (enable) { + pr_info("barrier-nospec: using isync; sync as speculation barrier\n"); + instr[0] = PPC_RAW_ISYNC(); + instr[1] = PPC_RAW_SYNC(); + } + + i = do_patch_fixups(start, end, instr, ARRAY_SIZE(instr)); + + printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); +} + +static void __init patch_btb_flush_section(long *curr) +{ + unsigned int *start, *end; + + start = (void *)curr + *curr; + end = (void *)curr + *(curr + 1); + for (; start < end; start++) { + pr_devel("patching dest %lx\n", (unsigned long)start); + patch_instruction(start, ppc_inst(PPC_RAW_NOP())); + } +} + +void __init do_btb_flush_fixups(void) +{ + long *start, *end; + + start = PTRRELOC(&__start__btb_flush_fixup); + end = PTRRELOC(&__stop__btb_flush_fixup); + + for (; start < end; start += 2) + patch_btb_flush_section(start); +} +#endif /* CONFIG_PPC_E500 */ + +void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + long *start, *end; + u32 *dest; + + if (!(value & CPU_FTR_LWSYNC)) + return ; + + start = fixup_start; + end = fixup_end; + + for (; start < end; start++) { + dest = (void *)start + *start; + raw_patch_instruction(dest, ppc_inst(PPC_INST_LWSYNC)); + } +} + +static void __init do_final_fixups(void) +{ +#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) + ppc_inst_t inst; + u32 *src, *dest, *end; + + if (PHYSICAL_START == 0) + return; + + src = (u32 *)(KERNELBASE + PHYSICAL_START); + dest = (u32 *)KERNELBASE; + end = (void *)src + (__end_interrupts - _stext); + + while (src < end) { + inst = ppc_inst_read(src); + raw_patch_instruction(dest, inst); + src = ppc_inst_next(src, src); + dest = ppc_inst_next(dest, dest); + } +#endif +} + +static unsigned long __initdata saved_cpu_features; +static unsigned int __initdata saved_mmu_features; +#ifdef CONFIG_PPC64 +static unsigned long __initdata saved_firmware_features; +#endif + +void __init apply_feature_fixups(void) +{ + struct cpu_spec *spec = PTRRELOC(*PTRRELOC(&cur_cpu_spec)); + + *PTRRELOC(&saved_cpu_features) = spec->cpu_features; + *PTRRELOC(&saved_mmu_features) = spec->mmu_features; + + /* + * Apply the CPU-specific and firmware specific fixups to kernel text + * (nop out sections not relevant to this CPU or this firmware). + */ + do_feature_fixups(spec->cpu_features, + PTRRELOC(&__start___ftr_fixup), + PTRRELOC(&__stop___ftr_fixup)); + + do_feature_fixups(spec->mmu_features, + PTRRELOC(&__start___mmu_ftr_fixup), + PTRRELOC(&__stop___mmu_ftr_fixup)); + + do_lwsync_fixups(spec->cpu_features, + PTRRELOC(&__start___lwsync_fixup), + PTRRELOC(&__stop___lwsync_fixup)); + +#ifdef CONFIG_PPC64 + saved_firmware_features = powerpc_firmware_features; + do_feature_fixups(powerpc_firmware_features, + &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup); +#endif + do_final_fixups(); +} + +void __init update_mmu_feature_fixups(unsigned long mask) +{ + saved_mmu_features &= ~mask; + saved_mmu_features |= cur_cpu_spec->mmu_features & mask; + + do_feature_fixups_mask(cur_cpu_spec->mmu_features, mask, + PTRRELOC(&__start___mmu_ftr_fixup), + PTRRELOC(&__stop___mmu_ftr_fixup)); + mmu_feature_keys_init(); +} + +void __init setup_feature_keys(void) +{ + /* + * Initialise jump label. This causes all the cpu/mmu_has_feature() + * checks to take on their correct polarity based on the current set of + * CPU/MMU features. + */ + jump_label_init(); + cpu_feature_keys_init(); + mmu_feature_keys_init(); +} + +static int __init check_features(void) +{ + WARN(saved_cpu_features != cur_cpu_spec->cpu_features, + "CPU features changed after feature patching!\n"); + WARN(saved_mmu_features != cur_cpu_spec->mmu_features, + "MMU features changed after feature patching!\n"); +#ifdef CONFIG_PPC64 + WARN(saved_firmware_features != powerpc_firmware_features, + "Firmware features changed after feature patching!\n"); +#endif + + return 0; +} +late_initcall(check_features); + +#ifdef CONFIG_FTR_FIXUP_SELFTEST + +#define check(x) \ + if (!(x)) printk("feature-fixups: test failed at line %d\n", __LINE__); + +static int patch_feature_section(unsigned long value, struct fixup_entry *fcur) +{ + return patch_feature_section_mask(value, ~0, fcur); +} + +/* This must be after the text it fixes up, vmlinux.lds.S enforces that atm */ +static struct fixup_entry fixup; + +static long __init calc_offset(struct fixup_entry *entry, unsigned int *p) +{ + return (unsigned long)p - (unsigned long)entry; +} + +static void __init test_basic_patching(void) +{ + extern unsigned int ftr_fixup_test1[]; + extern unsigned int end_ftr_fixup_test1[]; + extern unsigned int ftr_fixup_test1_orig[]; + extern unsigned int ftr_fixup_test1_expected[]; + int size = 4 * (end_ftr_fixup_test1 - ftr_fixup_test1); + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, ftr_fixup_test1 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test1 + 2); + fixup.alt_start_off = fixup.alt_end_off = 0; + + /* Sanity check */ + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(8, &fixup); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(ftr_fixup_test1, ftr_fixup_test1_orig, size); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); + patch_feature_section(~8, &fixup); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_expected, size) == 0); +} + +static void __init test_alternative_patching(void) +{ + extern unsigned int ftr_fixup_test2[]; + extern unsigned int end_ftr_fixup_test2[]; + extern unsigned int ftr_fixup_test2_orig[]; + extern unsigned int ftr_fixup_test2_alt[]; + extern unsigned int ftr_fixup_test2_expected[]; + int size = 4 * (end_ftr_fixup_test2 - ftr_fixup_test2); + + fixup.value = fixup.mask = 0xF; + fixup.start_off = calc_offset(&fixup, ftr_fixup_test2 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test2 + 2); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test2_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test2_alt + 1); + + /* Sanity check */ + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(0xF, &fixup); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(ftr_fixup_test2, ftr_fixup_test2_orig, size); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); + patch_feature_section(~0xF, &fixup); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_expected, size) == 0); +} + +static void __init test_alternative_case_too_big(void) +{ + extern unsigned int ftr_fixup_test3[]; + extern unsigned int end_ftr_fixup_test3[]; + extern unsigned int ftr_fixup_test3_orig[]; + extern unsigned int ftr_fixup_test3_alt[]; + int size = 4 * (end_ftr_fixup_test3 - ftr_fixup_test3); + + fixup.value = fixup.mask = 0xC; + fixup.start_off = calc_offset(&fixup, ftr_fixup_test3 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test3 + 2); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test3_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test3_alt + 2); + + /* Sanity check */ + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); + + /* Expect nothing to be patched, and the error returned to us */ + check(patch_feature_section(0xF, &fixup) == 1); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(0, &fixup) == 1); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(~0xF, &fixup) == 1); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); +} + +static void __init test_alternative_case_too_small(void) +{ + extern unsigned int ftr_fixup_test4[]; + extern unsigned int end_ftr_fixup_test4[]; + extern unsigned int ftr_fixup_test4_orig[]; + extern unsigned int ftr_fixup_test4_alt[]; + extern unsigned int ftr_fixup_test4_expected[]; + int size = 4 * (end_ftr_fixup_test4 - ftr_fixup_test4); + unsigned long flag; + + /* Check a high-bit flag */ + flag = 1UL << ((sizeof(unsigned long) - 1) * 8); + fixup.value = fixup.mask = flag; + fixup.start_off = calc_offset(&fixup, ftr_fixup_test4 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test4 + 5); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test4_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test4_alt + 2); + + /* Sanity check */ + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(flag, &fixup); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(ftr_fixup_test4, ftr_fixup_test4_orig, size); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); + patch_feature_section(~flag, &fixup); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_expected, size) == 0); +} + +static void test_alternative_case_with_branch(void) +{ + extern unsigned int ftr_fixup_test5[]; + extern unsigned int end_ftr_fixup_test5[]; + extern unsigned int ftr_fixup_test5_expected[]; + int size = 4 * (end_ftr_fixup_test5 - ftr_fixup_test5); + + check(memcmp(ftr_fixup_test5, ftr_fixup_test5_expected, size) == 0); +} + +static void __init test_alternative_case_with_external_branch(void) +{ + extern unsigned int ftr_fixup_test6[]; + extern unsigned int end_ftr_fixup_test6[]; + extern unsigned int ftr_fixup_test6_expected[]; + int size = 4 * (end_ftr_fixup_test6 - ftr_fixup_test6); + + check(memcmp(ftr_fixup_test6, ftr_fixup_test6_expected, size) == 0); +} + +static void __init test_alternative_case_with_branch_to_end(void) +{ + extern unsigned int ftr_fixup_test7[]; + extern unsigned int end_ftr_fixup_test7[]; + extern unsigned int ftr_fixup_test7_expected[]; + int size = 4 * (end_ftr_fixup_test7 - ftr_fixup_test7); + + check(memcmp(ftr_fixup_test7, ftr_fixup_test7_expected, size) == 0); +} + +static void __init test_cpu_macros(void) +{ + extern u8 ftr_fixup_test_FTR_macros[]; + extern u8 ftr_fixup_test_FTR_macros_expected[]; + unsigned long size = ftr_fixup_test_FTR_macros_expected - + ftr_fixup_test_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(ftr_fixup_test_FTR_macros, + ftr_fixup_test_FTR_macros_expected, size) == 0); +} + +static void __init test_fw_macros(void) +{ +#ifdef CONFIG_PPC64 + extern u8 ftr_fixup_test_FW_FTR_macros[]; + extern u8 ftr_fixup_test_FW_FTR_macros_expected[]; + unsigned long size = ftr_fixup_test_FW_FTR_macros_expected - + ftr_fixup_test_FW_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(ftr_fixup_test_FW_FTR_macros, + ftr_fixup_test_FW_FTR_macros_expected, size) == 0); +#endif +} + +static void __init test_lwsync_macros(void) +{ + extern u8 lwsync_fixup_test[]; + extern u8 end_lwsync_fixup_test[]; + extern u8 lwsync_fixup_test_expected_LWSYNC[]; + extern u8 lwsync_fixup_test_expected_SYNC[]; + unsigned long size = end_lwsync_fixup_test - + lwsync_fixup_test; + + /* The fixups have already been done for us during boot */ + if (cur_cpu_spec->cpu_features & CPU_FTR_LWSYNC) { + check(memcmp(lwsync_fixup_test, + lwsync_fixup_test_expected_LWSYNC, size) == 0); + } else { + check(memcmp(lwsync_fixup_test, + lwsync_fixup_test_expected_SYNC, size) == 0); + } +} + +#ifdef CONFIG_PPC64 +static void __init test_prefix_patching(void) +{ + extern unsigned int ftr_fixup_prefix1[]; + extern unsigned int end_ftr_fixup_prefix1[]; + extern unsigned int ftr_fixup_prefix1_orig[]; + extern unsigned int ftr_fixup_prefix1_expected[]; + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix1 - ftr_fixup_prefix1); + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix1 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix1 + 3); + fixup.alt_start_off = fixup.alt_end_off = 0; + + /* Sanity check */ + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) == 0); + + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_expected, size) == 0); + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) != 0); +} + +static void __init test_prefix_alt_patching(void) +{ + extern unsigned int ftr_fixup_prefix2[]; + extern unsigned int end_ftr_fixup_prefix2[]; + extern unsigned int ftr_fixup_prefix2_orig[]; + extern unsigned int ftr_fixup_prefix2_expected[]; + extern unsigned int ftr_fixup_prefix2_alt[]; + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix2 - ftr_fixup_prefix2); + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix2 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix2 + 3); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_prefix2_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_prefix2_alt + 2); + /* Sanity check */ + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) == 0); + + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_expected, size) == 0); + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) != 0); +} + +static void __init test_prefix_word_alt_patching(void) +{ + extern unsigned int ftr_fixup_prefix3[]; + extern unsigned int end_ftr_fixup_prefix3[]; + extern unsigned int ftr_fixup_prefix3_orig[]; + extern unsigned int ftr_fixup_prefix3_expected[]; + extern unsigned int ftr_fixup_prefix3_alt[]; + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix3 - ftr_fixup_prefix3); + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix3 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix3 + 4); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_prefix3_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_prefix3_alt + 3); + /* Sanity check */ + check(memcmp(ftr_fixup_prefix3, ftr_fixup_prefix3_orig, size) == 0); + + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_prefix3, ftr_fixup_prefix3_expected, size) == 0); + patch_feature_section(0, &fixup); + check(memcmp(ftr_fixup_prefix3, ftr_fixup_prefix3_orig, size) != 0); +} +#else +static inline void test_prefix_patching(void) {} +static inline void test_prefix_alt_patching(void) {} +static inline void test_prefix_word_alt_patching(void) {} +#endif /* CONFIG_PPC64 */ + +static int __init test_feature_fixups(void) +{ + printk(KERN_DEBUG "Running feature fixup self-tests ...\n"); + + test_basic_patching(); + test_alternative_patching(); + test_alternative_case_too_big(); + test_alternative_case_too_small(); + test_alternative_case_with_branch(); + test_alternative_case_with_external_branch(); + test_alternative_case_with_branch_to_end(); + test_cpu_macros(); + test_fw_macros(); + test_lwsync_macros(); + test_prefix_patching(); + test_prefix_alt_patching(); + test_prefix_word_alt_patching(); + + return 0; +} +late_initcall(test_feature_fixups); + +#endif /* CONFIG_FTR_FIXUP_SELFTEST */ diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S new file mode 100644 index 0000000000..151875050d --- /dev/null +++ b/arch/powerpc/lib/hweight_64.S @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/feature-fixups.h> + +/* Note: This code relies on -mminimal-toc */ + +_GLOBAL(__arch_hweight8) +BEGIN_FTR_SECTION + b CFUNC(__sw_hweight8) + nop + nop +FTR_SECTION_ELSE + PPC_POPCNTB(R3,R3) + clrldi r3,r3,64-8 + blr +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) +EXPORT_SYMBOL(__arch_hweight8) + +_GLOBAL(__arch_hweight16) +BEGIN_FTR_SECTION + b CFUNC(__sw_hweight16) + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(50) + PPC_POPCNTB(R3,R3) + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(50) + clrlwi r3,r3,16 + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) +EXPORT_SYMBOL(__arch_hweight16) + +_GLOBAL(__arch_hweight32) +BEGIN_FTR_SECTION + b CFUNC(__sw_hweight32) + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(51) + PPC_POPCNTB(R3,R3) + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(51) + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) +EXPORT_SYMBOL(__arch_hweight32) + +_GLOBAL(__arch_hweight64) +BEGIN_FTR_SECTION + b CFUNC(__sw_hweight64) + nop + nop + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(52) + PPC_POPCNTB(R3,R3) + srdi r4,r3,32 + add r3,r4,r3 + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(52) + PPC_POPCNTD(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) +EXPORT_SYMBOL(__arch_hweight64) diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S new file mode 100644 index 0000000000..e00abeabc5 --- /dev/null +++ b/arch/powerpc/lib/ldstfp.S @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Floating-point, VMX/Altivec and VSX loads and stores + * for use in instruction emulation. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/errno.h> + +#define STKFRM (PPC_MIN_STKFRM + 16) + +/* Get the contents of frN into *p; N is in r3 and p is in r4. */ +_GLOBAL(get_fpr) + mflr r0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f +reg = 0 + .rept 32 + stfd reg, 0(r4) + b 2f +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr +2: MTMSRD(r6) + isync + blr + +/* Put the contents of *p into frN; N is in r3 and p is in r4. */ +_GLOBAL(put_fpr) + mflr r0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f +reg = 0 + .rept 32 + lfd reg, 0(r4) + b 2f +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr +2: MTMSRD(r6) + isync + blr + +#ifdef CONFIG_ALTIVEC +/* Get the contents of vrN into *p; N is in r3 and p is in r4. */ +_GLOBAL(get_vr) + mflr r0 + mfmsr r6 + oris r7, r6, MSR_VEC@h + MTMSRD(r7) + isync + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f +reg = 0 + .rept 32 + stvx reg, 0, r4 + b 2f +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr +2: MTMSRD(r6) + isync + blr + +/* Put the contents of *p into vrN; N is in r3 and p is in r4. */ +_GLOBAL(put_vr) + mflr r0 + mfmsr r6 + oris r7, r6, MSR_VEC@h + MTMSRD(r7) + isync + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f +reg = 0 + .rept 32 + lvx reg, 0, r4 + b 2f +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr +2: MTMSRD(r6) + isync + blr +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +/* Get the contents of vsN into vs0; N is in r3. */ +_GLOBAL(get_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vs0 is already in vs0 */ + nop +reg = 1 + .rept 63 + XXLOR(0,reg,reg) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vs0 into vsN; N is in r3. */ +_GLOBAL(put_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* v0 is already in v0 */ + nop +reg = 1 + .rept 63 + XXLOR(reg,0,0) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(load_vsrn) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) +1: LXVD2X(0,R0,R4) +#ifdef __LITTLE_ENDIAN__ + XXSWAPD(0,0) +#endif + beq cr7,4f + bl put_vsr + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + addi r1,r1,STKFRM + blr + +/* Store VSX reg N to vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(store_vsrn) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + li r8,STKFRM-16 + MTMSRD(r7) + isync + STXVD2X(0,R1,R8) + bl get_vsr +#ifdef __LITTLE_ENDIAN__ + XXSWAPD(0,0) +#endif + STXVD2X(0,R0,R4) + LXVD2X(0,R1,R8) + PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr +#endif /* CONFIG_VSX */ + +/* Convert single-precision to double, without disturbing FPRs. */ +/* conv_sp_to_dp(float *sp, double *dp) */ +_GLOBAL(conv_sp_to_dp) + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + stfd fr0, -16(r1) + lfs fr0, 0(r3) + stfd fr0, 0(r4) + lfd fr0, -16(r1) + MTMSRD(r6) + isync + blr + +/* Convert single-precision to double, without disturbing FPRs. */ +/* conv_sp_to_dp(double *dp, float *sp) */ +_GLOBAL(conv_dp_to_sp) + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync + stfd fr0, -16(r1) + lfd fr0, 0(r3) + stfs fr0, 0(r4) + lfd fr0, -16(r1) + MTMSRD(r6) + isync + blr diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c new file mode 100644 index 0000000000..04165b7a16 --- /dev/null +++ b/arch/powerpc/lib/locks.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Spin and read/write lock operations. + * + * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM + * Rework to support virtual processors + */ + +#include <linux/kernel.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/smp.h> + +/* waiting for a spinlock... */ +#if defined(CONFIG_PPC_SPLPAR) +#include <asm/hvcall.h> +#include <asm/smp.h> + +void splpar_spin_yield(arch_spinlock_t *lock) +{ + unsigned int lock_value, holder_cpu, yield_count; + + lock_value = lock->slock; + if (lock_value == 0) + return; + holder_cpu = lock_value & 0xffff; + BUG_ON(holder_cpu >= NR_CPUS); + + yield_count = yield_count_of(holder_cpu); + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (lock->slock != lock_value) + return; /* something has changed */ + yield_to_preempted(holder_cpu, yield_count); +} +EXPORT_SYMBOL_GPL(splpar_spin_yield); + +/* + * Waiting for a read lock or a write lock on a rwlock... + * This turns out to be the same for read and write locks, since + * we only know the holder if it is write-locked. + */ +void splpar_rw_yield(arch_rwlock_t *rw) +{ + int lock_value; + unsigned int holder_cpu, yield_count; + + lock_value = rw->lock; + if (lock_value >= 0) + return; /* no write lock at present */ + holder_cpu = lock_value & 0xffff; + BUG_ON(holder_cpu >= NR_CPUS); + + yield_count = yield_count_of(holder_cpu); + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (rw->lock != lock_value) + return; /* something has changed */ + yield_to_preempted(holder_cpu, yield_count); +} +#endif diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S new file mode 100644 index 0000000000..6fd06cd20f --- /dev/null +++ b/arch/powerpc/lib/mem_64.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * String handling functions for PowerPC. + * + * Copyright (C) 1996 Paul Mackerras. + */ +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> +#include <asm/kasan.h> + +#ifndef CONFIG_KASAN +_GLOBAL(__memset16) + rlwimi r4,r4,16,0,15 + /* fall through */ + +_GLOBAL(__memset32) + rldimi r4,r4,32,0 + /* fall through */ + +_GLOBAL(__memset64) + neg r0,r3 + andi. r0,r0,7 + cmplw cr1,r5,r0 + b .Lms +EXPORT_SYMBOL(__memset16) +EXPORT_SYMBOL(__memset32) +EXPORT_SYMBOL(__memset64) +#endif + +_GLOBAL_KASAN(memset) + neg r0,r3 + rlwimi r4,r4,8,16,23 + andi. r0,r0,7 /* # bytes to be 8-byte aligned */ + rlwimi r4,r4,16,0,15 + cmplw cr1,r5,r0 /* do we get that far? */ + rldimi r4,r4,32,0 +.Lms: PPC_MTOCRF(1,r0) + mr r6,r3 + blt cr1,8f + beq 3f /* if already 8-byte aligned */ + subf r5,r0,r5 + bf 31,1f + stb r4,0(r6) + addi r6,r6,1 +1: bf 30,2f + sth r4,0(r6) + addi r6,r6,2 +2: bf 29,3f + stw r4,0(r6) + addi r6,r6,4 +3: srdi. r0,r5,6 + clrldi r5,r5,58 + mtctr r0 + beq 5f + .balign 16 +4: std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + std r4,32(r6) + std r4,40(r6) + std r4,48(r6) + std r4,56(r6) + addi r6,r6,64 + bdnz 4b +5: srwi. r0,r5,3 + clrlwi r5,r5,29 + PPC_MTOCRF(1,r0) + beq 8f + bf 29,6f + std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + addi r6,r6,32 +6: bf 30,7f + std r4,0(r6) + std r4,8(r6) + addi r6,r6,16 +7: bf 31,8f + std r4,0(r6) + addi r6,r6,8 +8: cmpwi r5,0 + PPC_MTOCRF(1,r5) + beqlr + bf 29,9f + stw r4,0(r6) + addi r6,r6,4 +9: bf 30,10f + sth r4,0(r6) + addi r6,r6,2 +10: bflr 31 + stb r4,0(r6) + blr +EXPORT_SYMBOL(memset) +EXPORT_SYMBOL_KASAN(memset) + +_GLOBAL_TOC_KASAN(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + b memcpy + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f + .balign 16 +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b +EXPORT_SYMBOL(memmove) +EXPORT_SYMBOL_KASAN(memmove) diff --git a/arch/powerpc/lib/memcmp_32.S b/arch/powerpc/lib/memcmp_32.S new file mode 100644 index 0000000000..f6fca5664e --- /dev/null +++ b/arch/powerpc/lib/memcmp_32.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * memcmp for PowerPC32 + * + * Copyright (C) 1996 Paul Mackerras. + * + */ + +#include <linux/export.h> +#include <asm/ppc_asm.h> + + .text + +_GLOBAL(memcmp) + srawi. r7, r5, 2 /* Divide len by 4 */ + mr r6, r3 + beq- 3f + mtctr r7 + li r7, 0 +1: lwzx r3, r6, r7 + lwzx r0, r4, r7 + addi r7, r7, 4 + cmplw cr0, r3, r0 + bdnzt eq, 1b + bne 5f +3: andi. r3, r5, 3 + beqlr + cmplwi cr1, r3, 2 + blt- cr1, 4f + lhzx r3, r6, r7 + lhzx r0, r4, r7 + addi r7, r7, 2 + subf. r3, r0, r3 + beqlr cr1 + bnelr +4: lbzx r3, r6, r7 + lbzx r0, r4, r7 + subf. r3, r0, r3 + blr +5: li r3, 1 + bgtlr + li r3, -1 + blr +EXPORT_SYMBOL(memcmp) diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S new file mode 100644 index 0000000000..142c666d38 --- /dev/null +++ b/arch/powerpc/lib/memcmp_64.S @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Author: Anton Blanchard <anton@au.ibm.com> + * Copyright 2015 IBM Corporation. + */ +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> + +#define off8 r6 +#define off16 r7 +#define off24 r8 + +#define rA r9 +#define rB r10 +#define rC r11 +#define rD r27 +#define rE r28 +#define rF r29 +#define rG r30 +#define rH r31 + +#ifdef __LITTLE_ENDIAN__ +#define LH lhbrx +#define LW lwbrx +#define LD ldbrx +#define LVS lvsr +#define VPERM(_VRT,_VRA,_VRB,_VRC) \ + vperm _VRT,_VRB,_VRA,_VRC +#else +#define LH lhzx +#define LW lwzx +#define LD ldx +#define LVS lvsl +#define VPERM(_VRT,_VRA,_VRB,_VRC) \ + vperm _VRT,_VRA,_VRB,_VRC +#endif + +#define VMX_THRESH 4096 +#define ENTER_VMX_OPS \ + mflr r0; \ + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl CFUNC(enter_vmx_ops); \ + cmpwi cr1,r3,0; \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +#define EXIT_VMX_OPS \ + mflr r0; \ + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ + std r0,16(r1); \ + stdu r1,-STACKFRAMESIZE(r1); \ + bl CFUNC(exit_vmx_ops); \ + ld r0,STACKFRAMESIZE+16(r1); \ + ld r3,STK_REG(R31)(r1); \ + ld r4,STK_REG(R30)(r1); \ + ld r5,STK_REG(R29)(r1); \ + addi r1,r1,STACKFRAMESIZE; \ + mtlr r0 + +/* + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with + * 16 bytes boundary and permute the result with the 1st 16 bytes. + + * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z | + * ^ ^ ^ + * 0xbbbb10 0xbbbb20 0xbbb30 + * ^ + * _vaddr + * + * + * _vmask is the mask generated by LVS + * _v1st_qw is the 1st aligned QW of current addr which is already loaded. + * for example: 0xyyyyyyyyyyyyy012 for big endian + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. + * for example: 0x3456789abcdefzzz for big endian + * The permute result is saved in _v_res. + * for example: 0x0123456789abcdef for big endian. + */ +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ + lvx _v2nd_qw,_vaddr,off16; \ + VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) + +/* + * There are 2 categories for memcmp: + * 1) src/dst has the same offset to the 8 bytes boundary. The handlers + * are named like .Lsameoffset_xxxx + * 2) src/dst has different offset to the 8 bytes boundary. The handlers + * are named like .Ldiffoffset_xxxx + */ +_GLOBAL_TOC(memcmp) + cmpdi cr1,r5,0 + + /* Use the short loop if the src/dst addresses are not + * with the same offset of 8 bytes align boundary. + */ + xor r6,r3,r4 + andi. r6,r6,7 + + /* Fall back to short loop if compare at aligned addrs + * with less than 8 bytes. + */ + cmpdi cr6,r5,7 + + beq cr1,.Lzero + bgt cr6,.Lno_short + +.Lshort: + mtctr r5 +1: lbz rA,0(r3) + lbz rB,0(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,1(r3) + lbz rB,1(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,2(r3) + lbz rB,2(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,3(r3) + lbz rB,3(r4) + subf. rC,rB,rA + bne .Lnon_zero + + addi r3,r3,4 + addi r4,r4,4 + + bdnz 1b + +.Lzero: + li r3,0 + blr + +.Lno_short: + dcbt 0,r3 + dcbt 0,r4 + bne .Ldiffoffset_8bytes_make_align_start + + +.Lsameoffset_8bytes_make_align_start: + /* attempt to compare bytes not aligned with 8 bytes so that + * rest comparison can run based on 8 bytes alignment. + */ + andi. r6,r3,7 + + /* Try to compare the first double word which is not 8 bytes aligned: + * load the first double word at (src & ~7UL) and shift left appropriate + * bits before comparision. + */ + rlwinm r6,r3,3,26,28 + beq .Lsameoffset_8bytes_aligned + clrrdi r3,r3,3 + clrrdi r4,r4,3 + LD rA,0,r3 + LD rB,0,r4 + sld rA,rA,r6 + sld rB,rB,r6 + cmpld cr0,rA,rB + srwi r6,r6,3 + bne cr0,.LcmpAB_lightweight + subfic r6,r6,8 + subf. r5,r6,r5 + addi r3,r3,8 + addi r4,r4,8 + beq .Lzero + +.Lsameoffset_8bytes_aligned: + /* now we are aligned with 8 bytes. + * Use .Llong loop if left cmp bytes are equal or greater than 32B. + */ + cmpdi cr6,r5,31 + bgt cr6,.Llong + +.Lcmp_lt32bytes: + /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ + cmpdi cr5,r5,7 + srdi r0,r5,3 + ble cr5,.Lcmp_rest_lt8bytes + + /* handle 8 ~ 31 bytes */ + clrldi r5,r5,61 + mtctr r0 +2: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + bdnz 2b + + cmpwi r5,0 + beq .Lzero + +.Lcmp_rest_lt8bytes: + /* + * Here we have less than 8 bytes to compare. At least s1 is aligned to + * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a + * page boundary, otherwise we might read past the end of the buffer and + * trigger a page fault. We use 4K as the conservative minimum page + * size. If we detect that case we go to the byte-by-byte loop. + * + * Otherwise the next double word is loaded from s1 and s2, and shifted + * right to compare the appropriate bits. + */ + clrldi r6,r4,(64-12) // r6 = r4 & 0xfff + cmpdi r6,0xff8 + bgt .Lshort + + subfic r6,r5,8 + slwi r6,r6,3 + LD rA,0,r3 + LD rB,0,r4 + srd rA,rA,r6 + srd rB,rB,r6 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero + +.Lnon_zero: + mr r3,rC + blr + +.Llong: +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* Try to use vmx loop if length is equal or greater than 4K */ + cmpldi cr6,r5,VMX_THRESH + bge cr6,.Lsameoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Llong_novmx_cmp: +#endif + /* At least s1 addr is aligned with 8 bytes */ + li off8,8 + li off16,16 + li off24,24 + + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + + srdi r0,r5,5 + mtctr r0 + andi. r5,r5,31 + + LD rA,0,r3 + LD rB,0,r4 + + LD rC,off8,r3 + LD rD,off8,r4 + + LD rE,off16,r3 + LD rF,off16,r4 + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lfirst32 + + LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lsecond32 + + .balign 16 + +1: LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdnz 1b + +.Lsecond32: + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + +.Ltail: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + + cmpdi r5,0 + beq .Lzero + b .Lshort + +.Lfirst32: + cmpld cr1,rC,rD + cmpld cr6,rE,rF + cmpld cr7,rG,rH + + bne cr0,.LcmpAB + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + + b .Ltail + +.LcmpAB: + li r3,1 + bgt cr0,.Lout + li r3,-1 + b .Lout + +.LcmpCD: + li r3,1 + bgt cr1,.Lout + li r3,-1 + b .Lout + +.LcmpEF: + li r3,1 + bgt cr6,.Lout + li r3,-1 + b .Lout + +.LcmpGH: + li r3,1 + bgt cr7,.Lout + li r3,-1 + +.Lout: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + blr + +.LcmpAB_lightweight: /* skip NV GPRS restore */ + li r3,1 + bgtlr + li r3,-1 + blr + +#ifdef CONFIG_ALTIVEC +.Lsameoffset_vmx_cmp: + /* Enter with src/dst addrs has the same offset with 8 bytes + * align boundary. + * + * There is an optimization based on following fact: memcmp() + * prones to fail early at the first 32 bytes. + * Before applying VMX instructions which will lead to 32x128bits + * VMX regs load/restore penalty, we compare the first 32 bytes + * so that we can catch the ~80% fail cases. + */ + + li r0,4 + mtctr r0 +.Lsameoffset_prechk_32B_loop: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + addi r5,r5,-8 + bdnz .Lsameoffset_prechk_32B_loop + + ENTER_VMX_OPS + beq cr1,.Llong_novmx_cmp + +3: + /* need to check whether r4 has the same offset with r3 + * for 16 bytes boundary. + */ + xor r0,r3,r4 + andi. r0,r0,0xf + bne .Ldiffoffset_vmx_cmp_start + + /* len is no less than 4KB. Need to align with 16 bytes further. + */ + andi. rA,r3,8 + LD rA,0,r3 + beq 4f + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + addi r5,r5,-8 + + beq cr0,4f + /* save and restore cr0 */ + mfocrf r5,128 + EXIT_VMX_OPS + mtocrf 128,r5 + b .LcmpAB_lightweight + +4: + /* compare 32 bytes for each loop */ + srdi r0,r5,5 + mtctr r0 + clrldi r5,r5,59 + li off16,16 + +.balign 16 +5: + lvx v0,0,r3 + lvx v1,0,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,7f + lvx v0,off16,r3 + lvx v1,off16,r4 + VCMPEQUD_RC(v0,v0,v1) + bnl cr6,6f + addi r3,r3,32 + addi r4,r4,32 + bdnz 5b + + EXIT_VMX_OPS + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +6: + addi r3,r3,16 + addi r4,r4,16 + +7: + /* diff the last 16 bytes */ + EXIT_VMX_OPS + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + li off8,8 + bne cr0,.LcmpAB_lightweight + + LD rA,off8,r3 + LD rB,off8,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero +#endif + +.Ldiffoffset_8bytes_make_align_start: + /* now try to align s1 with 8 bytes */ + rlwinm r6,r3,3,26,28 + beq .Ldiffoffset_align_s1_8bytes + + clrrdi r3,r3,3 + LD rA,0,r3 + LD rB,0,r4 /* unaligned load */ + sld rA,rA,r6 + srd rA,rA,r6 + srd rB,rB,r6 + cmpld cr0,rA,rB + srwi r6,r6,3 + bne cr0,.LcmpAB_lightweight + + subfic r6,r6,8 + subf. r5,r6,r5 + addi r3,r3,8 + add r4,r4,r6 + + beq .Lzero + +.Ldiffoffset_align_s1_8bytes: + /* now s1 is aligned with 8 bytes. */ +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + /* only do vmx ops when the size equal or greater than 4K bytes */ + cmpdi cr5,r5,VMX_THRESH + bge cr5,.Ldiffoffset_vmx_cmp +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +.Ldiffoffset_novmx_cmp: +#endif + + + cmpdi cr5,r5,31 + ble cr5,.Lcmp_lt32bytes + +#ifdef CONFIG_ALTIVEC + b .Llong_novmx_cmp +#else + b .Llong +#endif + +#ifdef CONFIG_ALTIVEC +.Ldiffoffset_vmx_cmp: + /* perform a 32 bytes pre-checking before + * enable VMX operations. + */ + li r0,4 + mtctr r0 +.Ldiffoffset_prechk_32B_loop: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + addi r3,r3,8 + addi r4,r4,8 + bne cr0,.LcmpAB_lightweight + addi r5,r5,-8 + bdnz .Ldiffoffset_prechk_32B_loop + + ENTER_VMX_OPS + beq cr1,.Ldiffoffset_novmx_cmp + +.Ldiffoffset_vmx_cmp_start: + /* Firstly try to align r3 with 16 bytes */ + andi. r6,r3,0xf + li off16,16 + beq .Ldiffoffset_vmx_s1_16bytes_align + + LVS v3,0,r3 + LVS v4,0,r4 + + lvx v5,0,r3 + lvx v6,0,r4 + LD_VSR_CROSS16B(r3,v3,v5,v7,v9) + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + + VCMPEQUB_RC(v7,v9,v10) + bnl cr6,.Ldiffoffset_vmx_diff_found + + subfic r6,r6,16 + subf r5,r6,r5 + add r3,r3,r6 + add r4,r4,r6 + +.Ldiffoffset_vmx_s1_16bytes_align: + /* now s1 is aligned with 16 bytes */ + lvx v6,0,r4 + LVS v4,0,r4 + srdi r6,r5,5 /* loop for 32 bytes each */ + clrldi r5,r5,59 + mtctr r6 + +.balign 16 +.Ldiffoffset_vmx_32bytesloop: + /* the first qw of r4 was saved in v6 */ + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + lvx v9,0,r3 + LD_VSR_CROSS16B(r4,v4,v6,v8,v10) + VCMPEQUB_RC(v7,v9,v10) + vor v6,v8,v8 + bnl cr6,.Ldiffoffset_vmx_diff_found + + addi r3,r3,16 + addi r4,r4,16 + + bdnz .Ldiffoffset_vmx_32bytesloop + + EXIT_VMX_OPS + + cmpdi r5,0 + beq .Lzero + b .Lcmp_lt32bytes + +.Ldiffoffset_vmx_diff_found: + EXIT_VMX_OPS + /* anyway, the diff will appear in next 16 bytes */ + li r5,16 + b .Lcmp_lt32bytes + +#endif +EXPORT_SYMBOL(memcmp) diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S new file mode 100644 index 0000000000..b5a67e2014 --- /dev/null +++ b/arch/powerpc/lib/memcpy_64.S @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + */ +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-compat.h> +#include <asm/feature-fixups.h> +#include <asm/kasan.h> + +#ifndef SELFTEST_CASE +/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */ +#define SELFTEST_CASE 0 +#endif + + .align 7 +_GLOBAL_TOC_KASAN(memcpy) +BEGIN_FTR_SECTION +#ifdef __LITTLE_ENDIAN__ + cmpdi cr7,r5,0 +#else + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* save destination pointer for return value */ +#endif +FTR_SECTION_ELSE +#ifdef CONFIG_PPC_BOOK3S_64 + b memcpy_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ + /* dumb little-endian memcpy that will get replaced at runtime */ + addi r9,r3,-1 + addi r4,r4,-1 + beqlr cr7 + mtctr r5 +1: lbzu r10,1(r4) + stbu r10,1(r9) + bdnz 1b + blr +#else + PPC_MTOCRF(0x01,r5) + cmpldi cr1,r5,16 + neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry + andi. r6,r6,7 + dcbt 0,r4 + blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + cleared. + At the time of writing the only CPU that has this combination of bits + set is Power6. */ +test_feature = (SELFTEST_CASE == 1) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) +.Ldst_aligned: + addi r3,r3,-16 +test_feature = (SELFTEST_CASE == 0) +BEGIN_FTR_SECTION + andi. r0,r4,7 + bne .Lsrc_unaligned +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) + srdi r7,r5,4 + ld r9,0(r4) + addi r4,r4,-8 + mtctr r7 + andi. r5,r5,7 + bf cr7*4+0,2f + addi r3,r3,8 + addi r4,r4,8 + mr r8,r9 + blt cr1,3f +1: ld r9,8(r4) + std r8,8(r3) +2: ldu r8,16(r4) + stdu r9,16(r3) + bdnz 1b +3: std r8,8(r3) + beq 3f + addi r3,r3,16 +.Ldo_tail: + bf cr7*4+1,1f + lwz r9,8(r4) + addi r4,r4,4 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + lhz r9,8(r4) + addi r4,r4,2 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + lbz r9,8(r4) + stb r9,0(r3) +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr + +.Lsrc_unaligned: + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpdi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + + bt cr7*4+0,0f + + ld r9,0(r4) # 3+2n loads, 2+2n stores + ld r0,8(r4) + sld r6,r9,r10 + ldu r9,16(r4) + srd r7,r0,r11 + sld r8,r0,r10 + or r7,r7,r6 + blt cr6,4f + ld r0,8(r4) + # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12 + b 2f + +0: ld r0,0(r4) # 4+2n loads, 3+2n stores + ldu r9,8(r4) + sld r8,r0,r10 + addi r3,r3,-8 + blt cr6,5f + ld r0,8(r4) + srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + srd r7,r0,r11 + sld r8,r0,r10 + addi r3,r3,16 + beq cr6,3f + + # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9 +1: or r7,r7,r6 + ld r0,8(r4) + std r12,8(r3) +2: srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + stdu r7,16(r3) + srd r7,r0,r11 + sld r8,r0,r10 + bdnz 1b + +3: std r12,8(r3) + or r7,r7,r6 +4: std r7,16(r3) +5: srd r12,r9,r11 + or r12,r8,r12 + std r12,24(r3) + beq 4f + cmpwi cr1,r5,8 + addi r3,r3,32 + sld r9,r9,r10 + ble cr1,6f + ld r0,8(r4) + srd r7,r0,r11 + or r9,r7,r9 +6: + bf cr7*4+1,1f + rotldi r9,r9,32 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + rotldi r9,r9,16 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + rotldi r9,r9,8 + stb r9,0(r3) +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr + +.Ldst_unaligned: + PPC_MTOCRF(0x01,r6) # put #bytes to 8B bdry into cr7 + subf r5,r6,r5 + li r7,0 + cmpldi cr1,r5,16 + bf cr7*4+3,1f + lbz r0,0(r4) + stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r4 + sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f + lwzx r0,r7,r4 + stwx r0,r7,r3 +3: PPC_MTOCRF(0x01,r5) + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: + bf cr7*4+0,1f + lwz r0,0(r4) + lwz r9,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f + lbz r0,0(r4) + stb r0,0(r3) +4: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr +#endif +EXPORT_SYMBOL(memcpy) +EXPORT_SYMBOL_KASAN(memcpy) diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S new file mode 100644 index 0000000000..9398b2b746 --- /dev/null +++ b/arch/powerpc/lib/memcpy_power7.S @@ -0,0 +1,641 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifndef SELFTEST_CASE +/* 0 == don't use VMX, 1 == use VMX */ +#define SELFTEST_CASE 0 +#endif + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + +_GLOBAL(memcpy_power7) + cmpldi r5,16 + cmpldi cr1,r5,4096 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blt .Lshort_copy + +#ifdef CONFIG_ALTIVEC +test_feature = SELFTEST_CASE +BEGIN_FTR_SECTION + bgt cr1, .Lvmx_copy +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +.Lvmx_copy: +#ifdef CONFIG_ALTIVEC + mflr r0 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl CFUNC(enter_vmx_ops) + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + + dcbt 0,r6,0b01000 + dcbt 0,r7,0b01010 + dcbtst 0,r9,0b01000 + dcbtst 0,r10,0b01010 + eieio + dcbt 0,r8,0b01010 /* GO */ + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx v1,0,r4 + addi r4,r4,16 + stvx v1,0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx v1,0,r4 + lvx v0,r4,r9 + addi r4,r4,32 + stvx v1,0,r3 + stvx v0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx v3,0,r4 + lvx v2,r4,r9 + lvx v1,r4,r10 + lvx v0,r4,r11 + addi r4,r4,64 + stvx v3,0,r3 + stvx v2,r3,r9 + stvx v1,r3,r10 + stvx v0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx v7,0,r4 + lvx v6,r4,r9 + lvx v5,r4,r10 + lvx v4,r4,r11 + lvx v3,r4,r12 + lvx v2,r4,r14 + lvx v1,r4,r15 + lvx v0,r4,r16 + addi r4,r4,128 + stvx v7,0,r3 + stvx v6,r3,r9 + stvx v5,r3,r10 + stvx v4,r3,r11 + stvx v3,r3,r12 + stvx v2,r3,r14 + stvx v1,r3,r15 + stvx v0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx v3,0,r4 + lvx v2,r4,r9 + lvx v1,r4,r10 + lvx v0,r4,r11 + addi r4,r4,64 + stvx v3,0,r3 + stvx v2,r3,r9 + stvx v1,r3,r10 + stvx v0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx v1,0,r4 + lvx v0,r4,r9 + addi r4,r4,32 + stvx v1,0,r3 + stvx v0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx v1,0,r4 + addi r4,r4,16 + stvx v1,0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b CFUNC(exit_vmx_ops) /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(v16,0,r4) /* Setup permute control vector */ + lvx v0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 + stvx v8,0,r3 + addi r3,r3,16 + vor v0,v1,v1 + +5: bf cr7*4+2,6f + lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 + stvx v8,0,r3 + stvx v9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx v3,0,r4 + VPERM(v8,v0,v3,v16) + lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) + lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) + lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 + stvx v8,0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx v7,0,r4 + VPERM(v8,v0,v7,v16) + lvx v6,r4,r9 + VPERM(v9,v7,v6,v16) + lvx v5,r4,r10 + VPERM(v10,v6,v5,v16) + lvx v4,r4,r11 + VPERM(v11,v5,v4,v16) + lvx v3,r4,r12 + VPERM(v12,v4,v3,v16) + lvx v2,r4,r14 + VPERM(v13,v3,v2,v16) + lvx v1,r4,r15 + VPERM(v14,v2,v1,v16) + lvx v0,r4,r16 + VPERM(v15,v1,v0,v16) + addi r4,r4,128 + stvx v8,0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + stvx v12,r3,r12 + stvx v13,r3,r14 + stvx v14,r3,r15 + stvx v15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx v3,0,r4 + VPERM(v8,v0,v3,v16) + lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) + lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) + lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 + stvx v8,0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 + stvx v8,0,r3 + stvx v9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx v1,0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 + stvx v8,0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b CFUNC(exit_vmx_ops) /* tail call optimise */ +#endif /* CONFIG_ALTIVEC */ diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c new file mode 100644 index 0000000000..4e724c4c01 --- /dev/null +++ b/arch/powerpc/lib/pmem.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2017 IBM Corporation. All rights reserved. + */ + +#include <linux/string.h> +#include <linux/export.h> +#include <linux/uaccess.h> +#include <linux/libnvdimm.h> + +#include <asm/cacheflush.h> + +static inline void __clean_pmem_range(unsigned long start, unsigned long stop) +{ + unsigned long shift = l1_dcache_shift(); + unsigned long bytes = l1_dcache_bytes(); + void *addr = (void *)(start & ~(bytes - 1)); + unsigned long size = stop - (unsigned long)addr + (bytes - 1); + unsigned long i; + + for (i = 0; i < size >> shift; i++, addr += bytes) + asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory"); +} + +static inline void __flush_pmem_range(unsigned long start, unsigned long stop) +{ + unsigned long shift = l1_dcache_shift(); + unsigned long bytes = l1_dcache_bytes(); + void *addr = (void *)(start & ~(bytes - 1)); + unsigned long size = stop - (unsigned long)addr + (bytes - 1); + unsigned long i; + + for (i = 0; i < size >> shift; i++, addr += bytes) + asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory"); +} + +static inline void clean_pmem_range(unsigned long start, unsigned long stop) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return __clean_pmem_range(start, stop); +} + +static inline void flush_pmem_range(unsigned long start, unsigned long stop) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return __flush_pmem_range(start, stop); +} + +/* + * CONFIG_ARCH_HAS_PMEM_API symbols + */ +void arch_wb_cache_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + clean_pmem_range(start, start + size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + flush_pmem_range(start, start + size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); + +/* + * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE symbols + */ +long __copy_from_user_flushcache(void *dest, const void __user *src, + unsigned size) +{ + unsigned long copied, start = (unsigned long) dest; + + copied = __copy_from_user(dest, src, size); + clean_pmem_range(start, start + size); + + return copied; +} + +void memcpy_flushcache(void *dest, const void *src, size_t size) +{ + unsigned long start = (unsigned long) dest; + + memcpy(dest, src, size); + clean_pmem_range(start, start + size); +} +EXPORT_SYMBOL(memcpy_flushcache); diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c new file mode 100644 index 0000000000..6dd2f46bd3 --- /dev/null +++ b/arch/powerpc/lib/qspinlock.c @@ -0,0 +1,1007 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/processor.h> +#include <linux/smp.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <asm/qspinlock.h> +#include <asm/paravirt.h> + +#define MAX_NODES 4 + +struct qnode { + struct qnode *next; + struct qspinlock *lock; + int cpu; + int yield_cpu; + u8 locked; /* 1 if lock acquired */ +}; + +struct qnodes { + int count; + struct qnode nodes[MAX_NODES]; +}; + +/* Tuning parameters */ +static int steal_spins __read_mostly = (1 << 5); +static int remote_steal_spins __read_mostly = (1 << 2); +#if _Q_SPIN_TRY_LOCK_STEAL == 1 +static const bool maybe_stealers = true; +#else +static bool maybe_stealers __read_mostly = true; +#endif +static int head_spins __read_mostly = (1 << 8); + +static bool pv_yield_owner __read_mostly = true; +static bool pv_yield_allow_steal __read_mostly = false; +static bool pv_spin_on_preempted_owner __read_mostly = false; +static bool pv_sleepy_lock __read_mostly = true; +static bool pv_sleepy_lock_sticky __read_mostly = false; +static u64 pv_sleepy_lock_interval_ns __read_mostly = 0; +static int pv_sleepy_lock_factor __read_mostly = 256; +static bool pv_yield_prev __read_mostly = true; +static bool pv_yield_propagate_owner __read_mostly = true; +static bool pv_prod_head __read_mostly = false; + +static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); +static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock); + +#if _Q_SPIN_SPEC_BARRIER == 1 +#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0) +#else +#define spec_barrier() do { } while (0) +#endif + +static __always_inline bool recently_sleepy(void) +{ + /* pv_sleepy_lock is true when this is called */ + if (pv_sleepy_lock_interval_ns) { + u64 seen = this_cpu_read(sleepy_lock_seen_clock); + + if (seen) { + u64 delta = sched_clock() - seen; + if (delta < pv_sleepy_lock_interval_ns) + return true; + this_cpu_write(sleepy_lock_seen_clock, 0); + } + } + + return false; +} + +static __always_inline int get_steal_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return steal_spins * pv_sleepy_lock_factor; + else + return steal_spins; +} + +static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return remote_steal_spins * pv_sleepy_lock_factor; + else + return remote_steal_spins; +} + +static __always_inline int get_head_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return head_spins * pv_sleepy_lock_factor; + else + return head_spins; +} + +static inline u32 encode_tail_cpu(int cpu) +{ + return (cpu + 1) << _Q_TAIL_CPU_OFFSET; +} + +static inline int decode_tail_cpu(u32 val) +{ + return (val >> _Q_TAIL_CPU_OFFSET) - 1; +} + +static inline int get_owner_cpu(u32 val) +{ + return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET; +} + +/* + * Try to acquire the lock if it was not already locked. If the tail matches + * mytail then clear it, otherwise leave it unchnaged. Return previous value. + * + * This is used by the head of the queue to acquire the lock and clean up + * its tail if it was the last one queued. + */ +static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail) +{ + u32 newval = queued_spin_encode_locked_val(); + u32 prev, tmp; + + asm volatile( +"1: lwarx %0,0,%2,%7 # trylock_clean_tail \n" + /* This test is necessary if there could be stealers */ +" andi. %1,%0,%5 \n" +" bne 3f \n" + /* Test whether the lock tail == mytail */ +" and %1,%0,%6 \n" +" cmpw 0,%1,%3 \n" + /* Merge the new locked value */ +" or %1,%1,%4 \n" +" bne 2f \n" + /* If the lock tail matched, then clear it, otherwise leave it. */ +" andc %1,%1,%6 \n" +"2: stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"3: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r"(tail), "r" (newval), + "i" (_Q_LOCKED_VAL), + "r" (_Q_TAIL_CPU_MASK), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return prev; +} + +/* + * Publish our tail, replacing previous tail. Return previous value. + * + * This provides a release barrier for publishing node, this pairs with the + * acquire barrier in get_tail_qnode() when the next CPU finds this tail + * value. + */ +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) +{ + u32 prev, tmp; + + kcsan_release(); + + asm volatile( +"\t" PPC_RELEASE_BARRIER " \n" +"1: lwarx %0,0,%2 # publish_tail_cpu \n" +" andc %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" + : "=&r" (prev), "=&r"(tmp) + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) + : "cr0", "memory"); + + return prev; +} + +static __always_inline u32 set_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # set_mustq \n" +" or %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + +static __always_inline u32 clear_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # clear_mustq \n" +" andc %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + +static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old) +{ + u32 prev; + u32 new = old | _Q_SLEEPY_VAL; + + BUG_ON(!(old & _Q_LOCKED_VAL)); + BUG_ON(old & _Q_SLEEPY_VAL); + + asm volatile( +"1: lwarx %0,0,%1 # try_set_sleepy \n" +" cmpw 0,%0,%2 \n" +" bne- 2f \n" +" stwcx. %3,0,%1 \n" +" bne- 1b \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r"(old), "r" (new) + : "cr0", "memory"); + + return likely(prev == old); +} + +static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } +} + +static __always_inline void seen_sleepy_lock(void) +{ + if (pv_sleepy_lock && pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); +} + +static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (val & _Q_LOCKED_VAL) { + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } + } +} + +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) +{ + int cpu = decode_tail_cpu(val); + struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); + int idx; + + /* + * After publishing the new tail and finding a previous tail in the + * previous val (which is the control dependency), this barrier + * orders the release barrier in publish_tail_cpu performed by the + * last CPU, with subsequently looking at its qnode structures + * after the barrier. + */ + smp_acquire__after_ctrl_dep(); + + for (idx = 0; idx < MAX_NODES; idx++) { + struct qnode *qnode = &qnodesp->nodes[idx]; + if (qnode->lock == lock) + return qnode; + } + + BUG(); +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) +{ + int owner; + u32 yield_count; + bool preempted = false; + + BUG_ON(!(val & _Q_LOCKED_VAL)); + + if (!paravirt) + goto relax; + + if (!pv_yield_owner) + goto relax; + + owner = get_owner_cpu(val); + yield_count = yield_count_of(owner); + + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + spin_end(); + + seen_sleepy_owner(lock, val); + preempted = true; + + /* + * Read the lock word after sampling the yield count. On the other side + * there may a wmb because the yield count update is done by the + * hypervisor preemption and the value update by the OS, however this + * ordering might reduce the chance of out of order accesses and + * improve the heuristic. + */ + smp_rmb(); + + if (READ_ONCE(lock->val) == val) { + if (mustq) + clear_mustq(lock); + yield_to_preempted(owner, yield_count); + if (mustq) + set_mustq(lock); + spin_begin(); + + /* Don't relax if we yielded. Maybe we should? */ + return preempted; + } + spin_begin(); +relax: + spin_cpu_relax(); + + return preempted; +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + return __yield_to_locked_owner(lock, val, paravirt, false); +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + bool mustq = false; + + if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal) + mustq = true; + + return __yield_to_locked_owner(lock, val, paravirt, mustq); +} + +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt) +{ + struct qnode *next; + int owner; + + if (!paravirt) + return; + if (!pv_yield_propagate_owner) + return; + + owner = get_owner_cpu(val); + if (*set_yield_cpu == owner) + return; + + next = READ_ONCE(node->next); + if (!next) + return; + + if (vcpu_is_preempted(owner)) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } else if (*set_yield_cpu != -1) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } +} + +/* Called inside spin_begin() */ +static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) +{ + int prev_cpu = decode_tail_cpu(val); + u32 yield_count; + int yield_cpu; + bool preempted = false; + + if (!paravirt) + goto relax; + + if (!pv_yield_propagate_owner) + goto yield_prev; + + yield_cpu = READ_ONCE(node->yield_cpu); + if (yield_cpu == -1) { + /* Propagate back the -1 CPU */ + if (node->next && node->next->yield_cpu != -1) + node->next->yield_cpu = yield_cpu; + goto yield_prev; + } + + yield_count = yield_count_of(yield_cpu); + if ((yield_count & 1) == 0) + goto yield_prev; /* owner vcpu is running */ + + if (get_owner_cpu(READ_ONCE(lock->val)) != yield_cpu) + goto yield_prev; /* re-sample lock owner */ + + spin_end(); + + preempted = true; + seen_sleepy_node(lock, val); + + smp_rmb(); + + if (yield_cpu == node->yield_cpu) { + if (node->next && node->next->yield_cpu != yield_cpu) + node->next->yield_cpu = yield_cpu; + yield_to_preempted(yield_cpu, yield_count); + spin_begin(); + return preempted; + } + spin_begin(); + +yield_prev: + if (!pv_yield_prev) + goto relax; + + yield_count = yield_count_of(prev_cpu); + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + spin_end(); + + preempted = true; + seen_sleepy_node(lock, val); + + smp_rmb(); /* See __yield_to_locked_owner comment */ + + if (!READ_ONCE(node->locked)) { + yield_to_preempted(prev_cpu, yield_count); + spin_begin(); + return preempted; + } + spin_begin(); + +relax: + spin_cpu_relax(); + + return preempted; +} + +static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy) +{ + if (iters >= get_steal_spins(paravirt, sleepy)) + return true; + + if (IS_ENABLED(CONFIG_NUMA) && + (iters >= get_remote_steal_spins(paravirt, sleepy))) { + int cpu = get_owner_cpu(val); + if (numa_node_id() != cpu_to_node(cpu)) + return true; + } + return false; +} + +static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) +{ + bool seen_preempted = false; + bool sleepy = false; + int iters = 0; + u32 val; + + if (!steal_spins) { + /* XXX: should spin_on_preempted_owner do anything here? */ + return false; + } + + /* Attempt to steal the lock */ + spin_begin(); + do { + bool preempted = false; + + val = READ_ONCE(lock->val); + if (val & _Q_MUST_Q_VAL) + break; + spec_barrier(); + + if (unlikely(!(val & _Q_LOCKED_VAL))) { + spin_end(); + if (__queued_spin_trylock_steal(lock)) + return true; + spin_begin(); + } else { + preempted = yield_to_locked_owner(lock, val, paravirt); + } + + if (paravirt && pv_sleepy_lock) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + + if (preempted) { + seen_preempted = true; + sleepy = true; + if (!pv_spin_on_preempted_owner) + iters++; + /* + * pv_spin_on_preempted_owner don't increase iters + * while the owner is preempted -- we won't interfere + * with it by definition. This could introduce some + * latency issue if we continually observe preempted + * owners, but hopefully that's a rare corner case of + * a badly oversubscribed system. + */ + } else { + iters++; + } + } while (!steal_break(val, iters, paravirt, sleepy)); + + spin_end(); + + return false; +} + +static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt) +{ + struct qnodes *qnodesp; + struct qnode *next, *node; + u32 val, old, tail; + bool seen_preempted = false; + bool sleepy = false; + bool mustq = false; + int idx; + int set_yield_cpu = -1; + int iters = 0; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + qnodesp = this_cpu_ptr(&qnodes); + if (unlikely(qnodesp->count >= MAX_NODES)) { + spec_barrier(); + while (!queued_spin_trylock(lock)) + cpu_relax(); + return; + } + + idx = qnodesp->count++; + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node = &qnodesp->nodes[idx]; + node->next = NULL; + node->lock = lock; + node->cpu = smp_processor_id(); + node->yield_cpu = -1; + node->locked = 0; + + tail = encode_tail_cpu(node->cpu); + + /* + * Assign all attributes of a node before it can be published. + * Issues an lwsync, serving as a release barrier, as well as a + * compiler barrier. + */ + old = publish_tail_cpu(lock, tail); + + /* + * If there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_CPU_MASK) { + struct qnode *prev = get_tail_qnode(lock, old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + /* Wait for mcs node lock to be released */ + spin_begin(); + while (!READ_ONCE(node->locked)) { + spec_barrier(); + + if (yield_to_prev(lock, node, old, paravirt)) + seen_preempted = true; + } + spec_barrier(); + spin_end(); + + /* Clear out stale propagated yield_cpu */ + if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1) + node->yield_cpu = -1; + + smp_rmb(); /* acquire barrier for the mcs lock */ + + /* + * Generic qspinlocks have this prefetch here, but it seems + * like it could cause additional line transitions because + * the waiter will keep loading from it. + */ + if (_Q_SPIN_PREFETCH_NEXT) { + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } + } + + /* We're at the head of the waitqueue, wait for the lock. */ +again: + spin_begin(); + for (;;) { + bool preempted; + + val = READ_ONCE(lock->val); + if (!(val & _Q_LOCKED_VAL)) + break; + spec_barrier(); + + if (paravirt && pv_sleepy_lock && maybe_stealers) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + + propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); + preempted = yield_head_to_locked_owner(lock, val, paravirt); + if (!maybe_stealers) + continue; + + if (preempted) + seen_preempted = true; + + if (paravirt && preempted) { + sleepy = true; + + if (!pv_spin_on_preempted_owner) + iters++; + } else { + iters++; + } + + if (!mustq && iters >= get_head_spins(paravirt, sleepy)) { + mustq = true; + set_mustq(lock); + val |= _Q_MUST_Q_VAL; + } + } + spec_barrier(); + spin_end(); + + /* If we're the last queued, must clean up the tail. */ + old = trylock_clean_tail(lock, tail); + if (unlikely(old & _Q_LOCKED_VAL)) { + BUG_ON(!maybe_stealers); + goto again; /* Can only be true if maybe_stealers. */ + } + + if ((old & _Q_TAIL_CPU_MASK) == tail) + goto release; /* We were the tail, no next. */ + + /* There is a next, must wait for node->next != NULL (MCS protocol) */ + next = READ_ONCE(node->next); + if (!next) { + spin_begin(); + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + spin_end(); + } + spec_barrier(); + + /* + * Unlock the next mcs waiter node. Release barrier is not required + * here because the acquirer is only accessing the lock word, and + * the acquire barrier we took the lock with orders that update vs + * this store to locked. The corresponding barrier is the smp_rmb() + * acquire barrier for mcs lock, above. + */ + if (paravirt && pv_prod_head) { + int next_cpu = next->cpu; + WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); + if (vcpu_is_preempted(next_cpu)) + prod_cpu(next_cpu); + } else { + WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); + } + +release: + qnodesp->count--; /* release the node */ +} + +void queued_spin_lock_slowpath(struct qspinlock *lock) +{ + /* + * This looks funny, but it induces the compiler to inline both + * sides of the branch rather than share code as when the condition + * is passed as the paravirt argument to the functions. + */ + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { + if (try_to_steal_lock(lock, true)) { + spec_barrier(); + return; + } + queued_spin_lock_mcs_queue(lock, true); + } else { + if (try_to_steal_lock(lock, false)) { + spec_barrier(); + return; + } + queued_spin_lock_mcs_queue(lock, false); + } +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void pv_spinlocks_init(void) +{ +} +#endif + +#include <linux/debugfs.h> +static int steal_spins_set(void *data, u64 val) +{ +#if _Q_SPIN_TRY_LOCK_STEAL == 1 + /* MAYBE_STEAL remains true */ + steal_spins = val; +#else + static DEFINE_MUTEX(lock); + + /* + * The lock slow path has a !maybe_stealers case that can assume + * the head of queue will not see concurrent waiters. That waiter + * is unsafe in the presence of stealers, so must keep them away + * from one another. + */ + + mutex_lock(&lock); + if (val && !steal_spins) { + maybe_stealers = true; + /* wait for queue head waiter to go away */ + synchronize_rcu(); + steal_spins = val; + } else if (!val && steal_spins) { + steal_spins = val; + /* wait for all possible stealers to go away */ + synchronize_rcu(); + maybe_stealers = false; + } else { + steal_spins = val; + } + mutex_unlock(&lock); +#endif + + return 0; +} + +static int steal_spins_get(void *data, u64 *val) +{ + *val = steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n"); + +static int remote_steal_spins_set(void *data, u64 val) +{ + remote_steal_spins = val; + + return 0; +} + +static int remote_steal_spins_get(void *data, u64 *val) +{ + *val = remote_steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n"); + +static int head_spins_set(void *data, u64 val) +{ + head_spins = val; + + return 0; +} + +static int head_spins_get(void *data, u64 *val) +{ + *val = head_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n"); + +static int pv_yield_owner_set(void *data, u64 val) +{ + pv_yield_owner = !!val; + + return 0; +} + +static int pv_yield_owner_get(void *data, u64 *val) +{ + *val = pv_yield_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n"); + +static int pv_yield_allow_steal_set(void *data, u64 val) +{ + pv_yield_allow_steal = !!val; + + return 0; +} + +static int pv_yield_allow_steal_get(void *data, u64 *val) +{ + *val = pv_yield_allow_steal; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n"); + +static int pv_spin_on_preempted_owner_set(void *data, u64 val) +{ + pv_spin_on_preempted_owner = !!val; + + return 0; +} + +static int pv_spin_on_preempted_owner_get(void *data, u64 *val) +{ + *val = pv_spin_on_preempted_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n"); + +static int pv_sleepy_lock_set(void *data, u64 val) +{ + pv_sleepy_lock = !!val; + + return 0; +} + +static int pv_sleepy_lock_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n"); + +static int pv_sleepy_lock_sticky_set(void *data, u64 val) +{ + pv_sleepy_lock_sticky = !!val; + + return 0; +} + +static int pv_sleepy_lock_sticky_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_sticky; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n"); + +static int pv_sleepy_lock_interval_ns_set(void *data, u64 val) +{ + pv_sleepy_lock_interval_ns = val; + + return 0; +} + +static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_interval_ns; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n"); + +static int pv_sleepy_lock_factor_set(void *data, u64 val) +{ + pv_sleepy_lock_factor = val; + + return 0; +} + +static int pv_sleepy_lock_factor_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_factor; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n"); + +static int pv_yield_prev_set(void *data, u64 val) +{ + pv_yield_prev = !!val; + + return 0; +} + +static int pv_yield_prev_get(void *data, u64 *val) +{ + *val = pv_yield_prev; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n"); + +static int pv_yield_propagate_owner_set(void *data, u64 val) +{ + pv_yield_propagate_owner = !!val; + + return 0; +} + +static int pv_yield_propagate_owner_get(void *data, u64 *val) +{ + *val = pv_yield_propagate_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n"); + +static int pv_prod_head_set(void *data, u64 val) +{ + pv_prod_head = !!val; + + return 0; +} + +static int pv_prod_head_get(void *data, u64 *val) +{ + *val = pv_prod_head; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n"); + +static __init int spinlock_debugfs_init(void) +{ + debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); + debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins); + debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); + if (is_shared_processor()) { + debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); + debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); + debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner); + debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock); + debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky); + debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns); + debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor); + debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); + debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); + debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head); + } + + return 0; +} +device_initcall(spinlock_debugfs_init); diff --git a/arch/powerpc/lib/quad.S b/arch/powerpc/lib/quad.S new file mode 100644 index 0000000000..da71760e50 --- /dev/null +++ b/arch/powerpc/lib/quad.S @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Quadword loads and stores + * for use in instruction emulation. + * + * Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <linux/errno.h> + +/* do_lq(unsigned long ea, unsigned long *regs) */ +_GLOBAL(do_lq) +1: lq r6, 0(r3) + std r6, 0(r4) + std r7, 8(r4) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_stq(unsigned long ea, unsigned long val0, unsigned long val1) */ +_GLOBAL(do_stq) +1: stq r4, 0(r3) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_lqarx(unsigned long ea, unsigned long *regs) */ +_GLOBAL(do_lqarx) +1: PPC_LQARX(6, 0, 3, 0) + std r6, 0(r4) + std r7, 8(r4) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) + +/* do_stqcx(unsigned long ea, unsigned long val0, unsigned long val1, + unsigned int *crp) */ + +_GLOBAL(do_stqcx) +1: PPC_STQCX(4, 0, 3) + mfcr r5 + stw r5, 0(r6) + li r3, 0 + blr +2: li r3, -EFAULT + blr + EX_TABLE(1b, 2b) diff --git a/arch/powerpc/lib/restart_table.c b/arch/powerpc/lib/restart_table.c new file mode 100644 index 0000000000..bccb662c1b --- /dev/null +++ b/arch/powerpc/lib/restart_table.c @@ -0,0 +1,56 @@ +#include <asm/interrupt.h> +#include <asm/kprobes.h> + +struct soft_mask_table_entry { + unsigned long start; + unsigned long end; +}; + +struct restart_table_entry { + unsigned long start; + unsigned long end; + unsigned long fixup; +}; + +extern struct soft_mask_table_entry __start___soft_mask_table[]; +extern struct soft_mask_table_entry __stop___soft_mask_table[]; + +extern struct restart_table_entry __start___restart_table[]; +extern struct restart_table_entry __stop___restart_table[]; + +/* Given an address, look for it in the soft mask table */ +bool search_kernel_soft_mask_table(unsigned long addr) +{ + struct soft_mask_table_entry *smte = __start___soft_mask_table; + + while (smte < __stop___soft_mask_table) { + unsigned long start = smte->start; + unsigned long end = smte->end; + + if (addr >= start && addr < end) + return true; + + smte++; + } + return false; +} +NOKPROBE_SYMBOL(search_kernel_soft_mask_table); + +/* Given an address, look for it in the kernel exception table */ +unsigned long search_kernel_restart_table(unsigned long addr) +{ + struct restart_table_entry *rte = __start___restart_table; + + while (rte < __stop___restart_table) { + unsigned long start = rte->start; + unsigned long end = rte->end; + unsigned long fixup = rte->fixup; + + if (addr >= start && addr < end) + return fixup; + + rte++; + } + return 0; +} +NOKPROBE_SYMBOL(search_kernel_restart_table); diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c new file mode 100644 index 0000000000..6aa774aa5b --- /dev/null +++ b/arch/powerpc/lib/rheap.c @@ -0,0 +1,747 @@ +/* + * A Remote Heap. Remote means that we don't touch the memory that the + * heap points to. Normal heap implementations use the memory they manage + * to place their list. We cannot do that because the memory we manage may + * have special properties, for example it is uncachable or of different + * endianess. + * + * Author: Pantelis Antoniou <panto@intracom.gr> + * + * 2004 (c) INTRACOM S.A. Greece. This file is licensed under + * the terms of the GNU General Public License version 2. This program + * is licensed "as is" without any warranty of any kind, whether express + * or implied. + */ +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <asm/rheap.h> + +/* + * Fixup a list_head, needed when copying lists. If the pointers fall + * between s and e, apply the delta. This assumes that + * sizeof(struct list_head *) == sizeof(unsigned long *). + */ +static inline void fixup(unsigned long s, unsigned long e, int d, + struct list_head *l) +{ + unsigned long *pp; + + pp = (unsigned long *)&l->next; + if (*pp >= s && *pp < e) + *pp += d; + + pp = (unsigned long *)&l->prev; + if (*pp >= s && *pp < e) + *pp += d; +} + +/* Grow the allocated blocks */ +static int grow(rh_info_t * info, int max_blocks) +{ + rh_block_t *block, *blk; + int i, new_blocks; + int delta; + unsigned long blks, blke; + + if (max_blocks <= info->max_blocks) + return -EINVAL; + + new_blocks = max_blocks - info->max_blocks; + + block = kmalloc_array(max_blocks, sizeof(rh_block_t), GFP_ATOMIC); + if (block == NULL) + return -ENOMEM; + + if (info->max_blocks > 0) { + + /* copy old block area */ + memcpy(block, info->block, + sizeof(rh_block_t) * info->max_blocks); + + delta = (char *)block - (char *)info->block; + + /* and fixup list pointers */ + blks = (unsigned long)info->block; + blke = (unsigned long)(info->block + info->max_blocks); + + for (i = 0, blk = block; i < info->max_blocks; i++, blk++) + fixup(blks, blke, delta, &blk->list); + + fixup(blks, blke, delta, &info->empty_list); + fixup(blks, blke, delta, &info->free_list); + fixup(blks, blke, delta, &info->taken_list); + + /* free the old allocated memory */ + if ((info->flags & RHIF_STATIC_BLOCK) == 0) + kfree(info->block); + } + + info->block = block; + info->empty_slots += new_blocks; + info->max_blocks = max_blocks; + info->flags &= ~RHIF_STATIC_BLOCK; + + /* add all new blocks to the free list */ + blk = block + info->max_blocks - new_blocks; + for (i = 0; i < new_blocks; i++, blk++) + list_add(&blk->list, &info->empty_list); + + return 0; +} + +/* + * Assure at least the required amount of empty slots. If this function + * causes a grow in the block area then all pointers kept to the block + * area are invalid! + */ +static int assure_empty(rh_info_t * info, int slots) +{ + int max_blocks; + + /* This function is not meant to be used to grow uncontrollably */ + if (slots >= 4) + return -EINVAL; + + /* Enough space */ + if (info->empty_slots >= slots) + return 0; + + /* Next 16 sized block */ + max_blocks = ((info->max_blocks + slots) + 15) & ~15; + + return grow(info, max_blocks); +} + +static rh_block_t *get_slot(rh_info_t * info) +{ + rh_block_t *blk; + + /* If no more free slots, and failure to extend. */ + /* XXX: You should have called assure_empty before */ + if (info->empty_slots == 0) { + printk(KERN_ERR "rh: out of slots; crash is imminent.\n"); + return NULL; + } + + /* Get empty slot to use */ + blk = list_entry(info->empty_list.next, rh_block_t, list); + list_del_init(&blk->list); + info->empty_slots--; + + /* Initialize */ + blk->start = 0; + blk->size = 0; + blk->owner = NULL; + + return blk; +} + +static inline void release_slot(rh_info_t * info, rh_block_t * blk) +{ + list_add(&blk->list, &info->empty_list); + info->empty_slots++; +} + +static void attach_free_block(rh_info_t * info, rh_block_t * blkn) +{ + rh_block_t *blk; + rh_block_t *before; + rh_block_t *after; + rh_block_t *next; + int size; + unsigned long s, e, bs, be; + struct list_head *l; + + /* We assume that they are aligned properly */ + size = blkn->size; + s = blkn->start; + e = s + size; + + /* Find the blocks immediately before and after the given one + * (if any) */ + before = NULL; + after = NULL; + next = NULL; + + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + + bs = blk->start; + be = bs + blk->size; + + if (next == NULL && s >= bs) + next = blk; + + if (be == s) + before = blk; + + if (e == bs) + after = blk; + + /* If both are not null, break now */ + if (before != NULL && after != NULL) + break; + } + + /* Now check if they are really adjacent */ + if (before && s != (before->start + before->size)) + before = NULL; + + if (after && e != after->start) + after = NULL; + + /* No coalescing; list insert and return */ + if (before == NULL && after == NULL) { + + if (next != NULL) + list_add(&blkn->list, &next->list); + else + list_add(&blkn->list, &info->free_list); + + return; + } + + /* We don't need it anymore */ + release_slot(info, blkn); + + /* Grow the before block */ + if (before != NULL && after == NULL) { + before->size += size; + return; + } + + /* Grow the after block backwards */ + if (before == NULL && after != NULL) { + after->start -= size; + after->size += size; + return; + } + + /* Grow the before block, and release the after block */ + before->size += size + after->size; + list_del(&after->list); + release_slot(info, after); +} + +static void attach_taken_block(rh_info_t * info, rh_block_t * blkn) +{ + rh_block_t *blk; + struct list_head *l; + + /* Find the block immediately before the given one (if any) */ + list_for_each(l, &info->taken_list) { + blk = list_entry(l, rh_block_t, list); + if (blk->start > blkn->start) { + list_add_tail(&blkn->list, &blk->list); + return; + } + } + + list_add_tail(&blkn->list, &info->taken_list); +} + +/* + * Create a remote heap dynamically. Note that no memory for the blocks + * are allocated. It will upon the first allocation + */ +rh_info_t *rh_create(unsigned int alignment) +{ + rh_info_t *info; + + /* Alignment must be a power of two */ + if ((alignment & (alignment - 1)) != 0) + return ERR_PTR(-EINVAL); + + info = kmalloc(sizeof(*info), GFP_ATOMIC); + if (info == NULL) + return ERR_PTR(-ENOMEM); + + info->alignment = alignment; + + /* Initially everything as empty */ + info->block = NULL; + info->max_blocks = 0; + info->empty_slots = 0; + info->flags = 0; + + INIT_LIST_HEAD(&info->empty_list); + INIT_LIST_HEAD(&info->free_list); + INIT_LIST_HEAD(&info->taken_list); + + return info; +} +EXPORT_SYMBOL_GPL(rh_create); + +/* + * Destroy a dynamically created remote heap. Deallocate only if the areas + * are not static + */ +void rh_destroy(rh_info_t * info) +{ + if ((info->flags & RHIF_STATIC_BLOCK) == 0) + kfree(info->block); + + if ((info->flags & RHIF_STATIC_INFO) == 0) + kfree(info); +} +EXPORT_SYMBOL_GPL(rh_destroy); + +/* + * Initialize in place a remote heap info block. This is needed to support + * operation very early in the startup of the kernel, when it is not yet safe + * to call kmalloc. + */ +void rh_init(rh_info_t * info, unsigned int alignment, int max_blocks, + rh_block_t * block) +{ + int i; + rh_block_t *blk; + + /* Alignment must be a power of two */ + if ((alignment & (alignment - 1)) != 0) + return; + + info->alignment = alignment; + + /* Initially everything as empty */ + info->block = block; + info->max_blocks = max_blocks; + info->empty_slots = max_blocks; + info->flags = RHIF_STATIC_INFO | RHIF_STATIC_BLOCK; + + INIT_LIST_HEAD(&info->empty_list); + INIT_LIST_HEAD(&info->free_list); + INIT_LIST_HEAD(&info->taken_list); + + /* Add all new blocks to the free list */ + for (i = 0, blk = block; i < max_blocks; i++, blk++) + list_add(&blk->list, &info->empty_list); +} +EXPORT_SYMBOL_GPL(rh_init); + +/* Attach a free memory region, coalesces regions if adjacent */ +int rh_attach_region(rh_info_t * info, unsigned long start, int size) +{ + rh_block_t *blk; + unsigned long s, e, m; + int r; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (IS_ERR_VALUE(e) || (e < s)) + return -ERANGE; + + /* Take final values */ + start = s; + size = e - s; + + /* Grow the blocks, if needed */ + r = assure_empty(info, 1); + if (r < 0) + return r; + + blk = get_slot(info); + blk->start = start; + blk->size = size; + blk->owner = NULL; + + attach_free_block(info, blk); + + return 0; +} +EXPORT_SYMBOL_GPL(rh_attach_region); + +/* Detatch given address range, splits free block if needed. */ +unsigned long rh_detach_region(rh_info_t * info, unsigned long start, int size) +{ + struct list_head *l; + rh_block_t *blk, *newblk; + unsigned long s, e, m, bs, be; + + /* Validate size */ + if (size <= 0) + return (unsigned long) -EINVAL; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (assure_empty(info, 1) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + /* The range must lie entirely inside one free block */ + bs = blk->start; + be = blk->start + blk->size; + if (s >= bs && e <= be) + break; + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Perfect fit */ + if (bs == s && be == e) { + /* Delete from free list, release slot */ + list_del(&blk->list); + release_slot(info, blk); + return s; + } + + /* blk still in free list, with updated start and/or size */ + if (bs == s || be == e) { + if (bs == s) + blk->start += size; + blk->size -= size; + + } else { + /* The front free fragment */ + blk->size = s - bs; + + /* the back free fragment */ + newblk = get_slot(info); + newblk->start = e; + newblk->size = be - e; + + list_add(&newblk->list, &blk->list); + } + + return s; +} +EXPORT_SYMBOL_GPL(rh_detach_region); + +/* Allocate a block of memory at the specified alignment. The value returned + * is an offset into the buffer initialized by rh_init(), or a negative number + * if there is an error. + */ +unsigned long rh_alloc_align(rh_info_t * info, int size, int alignment, const char *owner) +{ + struct list_head *l; + rh_block_t *blk; + rh_block_t *newblk; + unsigned long start, sp_size; + + /* Validate size, and alignment must be power of two */ + if (size <= 0 || (alignment & (alignment - 1)) != 0) + return (unsigned long) -EINVAL; + + /* Align to configured alignment */ + size = (size + (info->alignment - 1)) & ~(info->alignment - 1); + + if (assure_empty(info, 2) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + if (size <= blk->size) { + start = (blk->start + alignment - 1) & ~(alignment - 1); + if (start + size <= blk->start + blk->size) + break; + } + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Just fits */ + if (blk->size == size) { + /* Move from free list to taken list */ + list_del(&blk->list); + newblk = blk; + } else { + /* Fragment caused, split if needed */ + /* Create block for fragment in the beginning */ + sp_size = start - blk->start; + if (sp_size) { + rh_block_t *spblk; + + spblk = get_slot(info); + spblk->start = blk->start; + spblk->size = sp_size; + /* add before the blk */ + list_add(&spblk->list, blk->list.prev); + } + newblk = get_slot(info); + newblk->start = start; + newblk->size = size; + + /* blk still in free list, with updated start and size + * for fragment in the end */ + blk->start = start + size; + blk->size -= sp_size + size; + /* No fragment in the end, remove blk */ + if (blk->size == 0) { + list_del(&blk->list); + release_slot(info, blk); + } + } + + newblk->owner = owner; + attach_taken_block(info, newblk); + + return start; +} +EXPORT_SYMBOL_GPL(rh_alloc_align); + +/* Allocate a block of memory at the default alignment. The value returned is + * an offset into the buffer initialized by rh_init(), or a negative number if + * there is an error. + */ +unsigned long rh_alloc(rh_info_t * info, int size, const char *owner) +{ + return rh_alloc_align(info, size, info->alignment, owner); +} +EXPORT_SYMBOL_GPL(rh_alloc); + +/* Allocate a block of memory at the given offset, rounded up to the default + * alignment. The value returned is an offset into the buffer initialized by + * rh_init(), or a negative number if there is an error. + */ +unsigned long rh_alloc_fixed(rh_info_t * info, unsigned long start, int size, const char *owner) +{ + struct list_head *l; + rh_block_t *blk, *newblk1, *newblk2; + unsigned long s, e, m, bs = 0, be = 0; + + /* Validate size */ + if (size <= 0) + return (unsigned long) -EINVAL; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (assure_empty(info, 2) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + /* The range must lie entirely inside one free block */ + bs = blk->start; + be = blk->start + blk->size; + if (s >= bs && e <= be) + break; + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Perfect fit */ + if (bs == s && be == e) { + /* Move from free list to taken list */ + list_del(&blk->list); + blk->owner = owner; + + start = blk->start; + attach_taken_block(info, blk); + + return start; + + } + + /* blk still in free list, with updated start and/or size */ + if (bs == s || be == e) { + if (bs == s) + blk->start += size; + blk->size -= size; + + } else { + /* The front free fragment */ + blk->size = s - bs; + + /* The back free fragment */ + newblk2 = get_slot(info); + newblk2->start = e; + newblk2->size = be - e; + + list_add(&newblk2->list, &blk->list); + } + + newblk1 = get_slot(info); + newblk1->start = s; + newblk1->size = e - s; + newblk1->owner = owner; + + start = newblk1->start; + attach_taken_block(info, newblk1); + + return start; +} +EXPORT_SYMBOL_GPL(rh_alloc_fixed); + +/* Deallocate the memory previously allocated by one of the rh_alloc functions. + * The return value is the size of the deallocated block, or a negative number + * if there is an error. + */ +int rh_free(rh_info_t * info, unsigned long start) +{ + rh_block_t *blk, *blk2; + struct list_head *l; + int size; + + /* Linear search for block */ + blk = NULL; + list_for_each(l, &info->taken_list) { + blk2 = list_entry(l, rh_block_t, list); + if (start < blk2->start) + break; + blk = blk2; + } + + if (blk == NULL || start > (blk->start + blk->size)) + return -EINVAL; + + /* Remove from taken list */ + list_del(&blk->list); + + /* Get size of freed block */ + size = blk->size; + attach_free_block(info, blk); + + return size; +} +EXPORT_SYMBOL_GPL(rh_free); + +int rh_get_stats(rh_info_t * info, int what, int max_stats, rh_stats_t * stats) +{ + rh_block_t *blk; + struct list_head *l; + struct list_head *h; + int nr; + + switch (what) { + + case RHGS_FREE: + h = &info->free_list; + break; + + case RHGS_TAKEN: + h = &info->taken_list; + break; + + default: + return -EINVAL; + } + + /* Linear search for block */ + nr = 0; + list_for_each(l, h) { + blk = list_entry(l, rh_block_t, list); + if (stats != NULL && nr < max_stats) { + stats->start = blk->start; + stats->size = blk->size; + stats->owner = blk->owner; + stats++; + } + nr++; + } + + return nr; +} +EXPORT_SYMBOL_GPL(rh_get_stats); + +int rh_set_owner(rh_info_t * info, unsigned long start, const char *owner) +{ + rh_block_t *blk, *blk2; + struct list_head *l; + int size; + + /* Linear search for block */ + blk = NULL; + list_for_each(l, &info->taken_list) { + blk2 = list_entry(l, rh_block_t, list); + if (start < blk2->start) + break; + blk = blk2; + } + + if (blk == NULL || start > (blk->start + blk->size)) + return -EINVAL; + + blk->owner = owner; + size = blk->size; + + return size; +} +EXPORT_SYMBOL_GPL(rh_set_owner); + +void rh_dump(rh_info_t * info) +{ + static rh_stats_t st[32]; /* XXX maximum 32 blocks */ + int maxnr; + int i, nr; + + maxnr = ARRAY_SIZE(st); + + printk(KERN_INFO + "info @0x%p (%d slots empty / %d max)\n", + info, info->empty_slots, info->max_blocks); + + printk(KERN_INFO " Free:\n"); + nr = rh_get_stats(info, RHGS_FREE, maxnr, st); + if (nr > maxnr) + nr = maxnr; + for (i = 0; i < nr; i++) + printk(KERN_INFO + " 0x%lx-0x%lx (%u)\n", + st[i].start, st[i].start + st[i].size, + st[i].size); + printk(KERN_INFO "\n"); + + printk(KERN_INFO " Taken:\n"); + nr = rh_get_stats(info, RHGS_TAKEN, maxnr, st); + if (nr > maxnr) + nr = maxnr; + for (i = 0; i < nr; i++) + printk(KERN_INFO + " 0x%lx-0x%lx (%u) %s\n", + st[i].start, st[i].start + st[i].size, + st[i].size, st[i].owner != NULL ? st[i].owner : ""); + printk(KERN_INFO "\n"); +} +EXPORT_SYMBOL_GPL(rh_dump); + +void rh_dump_blk(rh_info_t * info, rh_block_t * blk) +{ + printk(KERN_INFO + "blk @0x%p: 0x%lx-0x%lx (%u)\n", + blk, blk->start, blk->start + blk->size, blk->size); +} +EXPORT_SYMBOL_GPL(rh_dump_blk); + diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c new file mode 100644 index 0000000000..a4ab862506 --- /dev/null +++ b/arch/powerpc/lib/sstep.c @@ -0,0 +1,3666 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Single-step support. + * + * Copyright (C) 2004 Paul Mackerras <paulus@au.ibm.com>, IBM + */ +#include <linux/kernel.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/prefetch.h> +#include <asm/sstep.h> +#include <asm/processor.h> +#include <linux/uaccess.h> +#include <asm/cpu_has_feature.h> +#include <asm/cputable.h> +#include <asm/disassemble.h> + +#ifdef CONFIG_PPC64 +/* Bits in SRR1 that are copied from MSR */ +#define MSR_MASK 0xffffffff87c0ffffUL +#else +#define MSR_MASK 0x87c0ffff +#endif + +/* Bits in XER */ +#define XER_SO 0x80000000U +#define XER_OV 0x40000000U +#define XER_CA 0x20000000U +#define XER_OV32 0x00080000U +#define XER_CA32 0x00040000U + +#ifdef CONFIG_VSX +#define VSX_REGISTER_XTP(rd) ((((rd) & 1) << 5) | ((rd) & 0xfe)) +#endif + +#ifdef CONFIG_PPC_FPU +/* + * Functions in ldstfp.S + */ +extern void get_fpr(int rn, double *p); +extern void put_fpr(int rn, const double *p); +extern void get_vr(int rn, __vector128 *p); +extern void put_vr(int rn, __vector128 *p); +extern void load_vsrn(int vsr, const void *p); +extern void store_vsrn(int vsr, void *p); +extern void conv_sp_to_dp(const float *sp, double *dp); +extern void conv_dp_to_sp(const double *dp, float *sp); +#endif + +#ifdef __powerpc64__ +/* + * Functions in quad.S + */ +extern int do_lq(unsigned long ea, unsigned long *regs); +extern int do_stq(unsigned long ea, unsigned long val0, unsigned long val1); +extern int do_lqarx(unsigned long ea, unsigned long *regs); +extern int do_stqcx(unsigned long ea, unsigned long val0, unsigned long val1, + unsigned int *crp); +#endif + +#ifdef __LITTLE_ENDIAN__ +#define IS_LE 1 +#define IS_BE 0 +#else +#define IS_LE 0 +#define IS_BE 1 +#endif + +/* + * Emulate the truncation of 64 bit values in 32-bit mode. + */ +static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr, + unsigned long val) +{ + if ((msr & MSR_64BIT) == 0) + val &= 0xffffffffUL; + return val; +} + +/* + * Determine whether a conditional branch instruction would branch. + */ +static nokprobe_inline int branch_taken(unsigned int instr, + const struct pt_regs *regs, + struct instruction_op *op) +{ + unsigned int bo = (instr >> 21) & 0x1f; + unsigned int bi; + + if ((bo & 4) == 0) { + /* decrement counter */ + op->type |= DECCTR; + if (((bo >> 1) & 1) ^ (regs->ctr == 1)) + return 0; + } + if ((bo & 0x10) == 0) { + /* check bit from CR */ + bi = (instr >> 16) & 0x1f; + if (((regs->ccr >> (31 - bi)) & 1) != ((bo >> 3) & 1)) + return 0; + } + return 1; +} + +static nokprobe_inline long address_ok(struct pt_regs *regs, + unsigned long ea, int nb) +{ + if (!user_mode(regs)) + return 1; + if (access_ok((void __user *)ea, nb)) + return 1; + if (access_ok((void __user *)ea, 1)) + /* Access overlaps the end of the user region */ + regs->dar = TASK_SIZE_MAX - 1; + else + regs->dar = ea; + return 0; +} + +/* + * Calculate effective address for a D-form instruction + */ +static nokprobe_inline unsigned long dform_ea(unsigned int instr, + const struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) instr; /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return ea; +} + +#ifdef __powerpc64__ +/* + * Calculate effective address for a DS-form instruction + */ +static nokprobe_inline unsigned long dsform_ea(unsigned int instr, + const struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~3); /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return ea; +} + +/* + * Calculate effective address for a DQ-form instruction + */ +static nokprobe_inline unsigned long dqform_ea(unsigned int instr, + const struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~0xf); /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return ea; +} +#endif /* __powerpc64 */ + +/* + * Calculate effective address for an X-form instruction + */ +static nokprobe_inline unsigned long xform_ea(unsigned int instr, + const struct pt_regs *regs) +{ + int ra, rb; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + ea = regs->gpr[rb]; + if (ra) + ea += regs->gpr[ra]; + + return ea; +} + +/* + * Calculate effective address for a MLS:D-form / 8LS:D-form + * prefixed instruction + */ +static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr, + unsigned int suffix, + const struct pt_regs *regs) +{ + int ra, prefix_r; + unsigned int dd; + unsigned long ea, d0, d1, d; + + prefix_r = GET_PREFIX_R(instr); + ra = GET_PREFIX_RA(suffix); + + d0 = instr & 0x3ffff; + d1 = suffix & 0xffff; + d = (d0 << 16) | d1; + + /* + * sign extend a 34 bit number + */ + dd = (unsigned int)(d >> 2); + ea = (signed int)dd; + ea = (ea << 2) | (d & 0x3); + + if (!prefix_r && ra) + ea += regs->gpr[ra]; + else if (!prefix_r && !ra) + ; /* Leave ea as is */ + else if (prefix_r) + ea += regs->nip; + + /* + * (prefix_r && ra) is an invalid form. Should already be + * checked for by caller! + */ + + return ea; +} + +/* + * Return the largest power of 2, not greater than sizeof(unsigned long), + * such that x is a multiple of it. + */ +static nokprobe_inline unsigned long max_align(unsigned long x) +{ + x |= sizeof(unsigned long); + return x & -x; /* isolates rightmost bit */ +} + +static nokprobe_inline unsigned long byterev_2(unsigned long x) +{ + return ((x >> 8) & 0xff) | ((x & 0xff) << 8); +} + +static nokprobe_inline unsigned long byterev_4(unsigned long x) +{ + return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | + ((x & 0xff00) << 8) | ((x & 0xff) << 24); +} + +#ifdef __powerpc64__ +static nokprobe_inline unsigned long byterev_8(unsigned long x) +{ + return (byterev_4(x) << 32) | byterev_4(x >> 32); +} +#endif + +static nokprobe_inline void do_byte_reverse(void *ptr, int nb) +{ + switch (nb) { + case 2: + *(u16 *)ptr = byterev_2(*(u16 *)ptr); + break; + case 4: + *(u32 *)ptr = byterev_4(*(u32 *)ptr); + break; +#ifdef __powerpc64__ + case 8: + *(unsigned long *)ptr = byterev_8(*(unsigned long *)ptr); + break; + case 16: { + unsigned long *up = (unsigned long *)ptr; + unsigned long tmp; + tmp = byterev_8(up[0]); + up[0] = byterev_8(up[1]); + up[1] = tmp; + break; + } + case 32: { + unsigned long *up = (unsigned long *)ptr; + unsigned long tmp; + + tmp = byterev_8(up[0]); + up[0] = byterev_8(up[3]); + up[3] = tmp; + tmp = byterev_8(up[2]); + up[2] = byterev_8(up[1]); + up[1] = tmp; + break; + } + +#endif + default: + WARN_ON_ONCE(1); + } +} + +static __always_inline int +__read_mem_aligned(unsigned long *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + unsigned long x = 0; + + switch (nb) { + case 1: + unsafe_get_user(x, (unsigned char __user *)ea, Efault); + break; + case 2: + unsafe_get_user(x, (unsigned short __user *)ea, Efault); + break; + case 4: + unsafe_get_user(x, (unsigned int __user *)ea, Efault); + break; +#ifdef __powerpc64__ + case 8: + unsafe_get_user(x, (unsigned long __user *)ea, Efault); + break; +#endif + } + *dest = x; + return 0; + +Efault: + regs->dar = ea; + return -EFAULT; +} + +static nokprobe_inline int +read_mem_aligned(unsigned long *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + int err; + + if (is_kernel_addr(ea)) + return __read_mem_aligned(dest, ea, nb, regs); + + if (user_read_access_begin((void __user *)ea, nb)) { + err = __read_mem_aligned(dest, ea, nb, regs); + user_read_access_end(); + } else { + err = -EFAULT; + regs->dar = ea; + } + + return err; +} + +/* + * Copy from userspace to a buffer, using the largest possible + * aligned accesses, up to sizeof(long). + */ +static __always_inline int __copy_mem_in(u8 *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + int c; + + for (; nb > 0; nb -= c) { + c = max_align(ea); + if (c > nb) + c = max_align(nb); + switch (c) { + case 1: + unsafe_get_user(*dest, (u8 __user *)ea, Efault); + break; + case 2: + unsafe_get_user(*(u16 *)dest, (u16 __user *)ea, Efault); + break; + case 4: + unsafe_get_user(*(u32 *)dest, (u32 __user *)ea, Efault); + break; +#ifdef __powerpc64__ + case 8: + unsafe_get_user(*(u64 *)dest, (u64 __user *)ea, Efault); + break; +#endif + } + dest += c; + ea += c; + } + return 0; + +Efault: + regs->dar = ea; + return -EFAULT; +} + +static nokprobe_inline int copy_mem_in(u8 *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + int err; + + if (is_kernel_addr(ea)) + return __copy_mem_in(dest, ea, nb, regs); + + if (user_read_access_begin((void __user *)ea, nb)) { + err = __copy_mem_in(dest, ea, nb, regs); + user_read_access_end(); + } else { + err = -EFAULT; + regs->dar = ea; + } + + return err; +} + +static nokprobe_inline int read_mem_unaligned(unsigned long *dest, + unsigned long ea, int nb, + struct pt_regs *regs) +{ + union { + unsigned long ul; + u8 b[sizeof(unsigned long)]; + } u; + int i; + int err; + + u.ul = 0; + i = IS_BE ? sizeof(unsigned long) - nb : 0; + err = copy_mem_in(&u.b[i], ea, nb, regs); + if (!err) + *dest = u.ul; + return err; +} + +/* + * Read memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. N.B. nb must be 1, 2, 4 or 8. + * If nb < sizeof(long), the result is right-justified on BE systems. + */ +static int read_mem(unsigned long *dest, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return read_mem_aligned(dest, ea, nb, regs); + return read_mem_unaligned(dest, ea, nb, regs); +} +NOKPROBE_SYMBOL(read_mem); + +static __always_inline int +__write_mem_aligned(unsigned long val, unsigned long ea, int nb, struct pt_regs *regs) +{ + switch (nb) { + case 1: + unsafe_put_user(val, (unsigned char __user *)ea, Efault); + break; + case 2: + unsafe_put_user(val, (unsigned short __user *)ea, Efault); + break; + case 4: + unsafe_put_user(val, (unsigned int __user *)ea, Efault); + break; +#ifdef __powerpc64__ + case 8: + unsafe_put_user(val, (unsigned long __user *)ea, Efault); + break; +#endif + } + return 0; + +Efault: + regs->dar = ea; + return -EFAULT; +} + +static nokprobe_inline int +write_mem_aligned(unsigned long val, unsigned long ea, int nb, struct pt_regs *regs) +{ + int err; + + if (is_kernel_addr(ea)) + return __write_mem_aligned(val, ea, nb, regs); + + if (user_write_access_begin((void __user *)ea, nb)) { + err = __write_mem_aligned(val, ea, nb, regs); + user_write_access_end(); + } else { + err = -EFAULT; + regs->dar = ea; + } + + return err; +} + +/* + * Copy from a buffer to userspace, using the largest possible + * aligned accesses, up to sizeof(long). + */ +static __always_inline int __copy_mem_out(u8 *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + int c; + + for (; nb > 0; nb -= c) { + c = max_align(ea); + if (c > nb) + c = max_align(nb); + switch (c) { + case 1: + unsafe_put_user(*dest, (u8 __user *)ea, Efault); + break; + case 2: + unsafe_put_user(*(u16 *)dest, (u16 __user *)ea, Efault); + break; + case 4: + unsafe_put_user(*(u32 *)dest, (u32 __user *)ea, Efault); + break; +#ifdef __powerpc64__ + case 8: + unsafe_put_user(*(u64 *)dest, (u64 __user *)ea, Efault); + break; +#endif + } + dest += c; + ea += c; + } + return 0; + +Efault: + regs->dar = ea; + return -EFAULT; +} + +static nokprobe_inline int copy_mem_out(u8 *dest, unsigned long ea, int nb, struct pt_regs *regs) +{ + int err; + + if (is_kernel_addr(ea)) + return __copy_mem_out(dest, ea, nb, regs); + + if (user_write_access_begin((void __user *)ea, nb)) { + err = __copy_mem_out(dest, ea, nb, regs); + user_write_access_end(); + } else { + err = -EFAULT; + regs->dar = ea; + } + + return err; +} + +static nokprobe_inline int write_mem_unaligned(unsigned long val, + unsigned long ea, int nb, + struct pt_regs *regs) +{ + union { + unsigned long ul; + u8 b[sizeof(unsigned long)]; + } u; + int i; + + u.ul = val; + i = IS_BE ? sizeof(unsigned long) - nb : 0; + return copy_mem_out(&u.b[i], ea, nb, regs); +} + +/* + * Write memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. N.B. nb must be 1, 2, 4 or 8. + */ +static int write_mem(unsigned long val, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return write_mem_aligned(val, ea, nb, regs); + return write_mem_unaligned(val, ea, nb, regs); +} +NOKPROBE_SYMBOL(write_mem); + +#ifdef CONFIG_PPC_FPU +/* + * These access either the real FP register or the image in the + * thread_struct, depending on regs->msr & MSR_FP. + */ +static int do_fp_load(struct instruction_op *op, unsigned long ea, + struct pt_regs *regs, bool cross_endian) +{ + int err, rn, nb; + union { + int i; + unsigned int u; + float f; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; + } u; + + nb = GETSIZE(op->type); + if (!address_ok(regs, ea, nb)) + return -EFAULT; + rn = op->reg; + err = copy_mem_in(u.b, ea, nb, regs); + if (err) + return err; + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } + preempt_disable(); + if (nb == 4) { + if (op->type & FPCONV) + conv_sp_to_dp(&u.f, &u.d[0]); + else if (op->type & SIGNEXT) + u.l[0] = u.i; + else + u.l[0] = u.u; + } + if (regs->msr & MSR_FP) + put_fpr(rn, &u.d[0]); + else + current->thread.TS_FPR(rn) = u.l[0]; + if (nb == 16) { + /* lfdp */ + rn |= 1; + if (regs->msr & MSR_FP) + put_fpr(rn, &u.d[1]); + else + current->thread.TS_FPR(rn) = u.l[1]; + } + preempt_enable(); + return 0; +} +NOKPROBE_SYMBOL(do_fp_load); + +static int do_fp_store(struct instruction_op *op, unsigned long ea, + struct pt_regs *regs, bool cross_endian) +{ + int rn, nb; + union { + unsigned int u; + float f; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; + } u; + + nb = GETSIZE(op->type); + if (!address_ok(regs, ea, nb)) + return -EFAULT; + rn = op->reg; + preempt_disable(); + if (regs->msr & MSR_FP) + get_fpr(rn, &u.d[0]); + else + u.l[0] = current->thread.TS_FPR(rn); + if (nb == 4) { + if (op->type & FPCONV) + conv_dp_to_sp(&u.d[0], &u.f); + else + u.u = u.l[0]; + } + if (nb == 16) { + rn |= 1; + if (regs->msr & MSR_FP) + get_fpr(rn, &u.d[1]); + else + u.l[1] = current->thread.TS_FPR(rn); + } + preempt_enable(); + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } + return copy_mem_out(u.b, ea, nb, regs); +} +NOKPROBE_SYMBOL(do_fp_store); +#endif + +#ifdef CONFIG_ALTIVEC +/* For Altivec/VMX, no need to worry about alignment */ +static nokprobe_inline int do_vec_load(int rn, unsigned long ea, + int size, struct pt_regs *regs, + bool cross_endian) +{ + int err; + union { + __vector128 v; + u8 b[sizeof(__vector128)]; + } u = {}; + + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + /* align to multiple of size */ + ea &= ~(size - 1); + err = copy_mem_in(&u.b[ea & 0xf], ea, size, regs); + if (err) + return err; + if (unlikely(cross_endian)) + do_byte_reverse(&u.b[ea & 0xf], size); + preempt_disable(); + if (regs->msr & MSR_VEC) + put_vr(rn, &u.v); + else + current->thread.vr_state.vr[rn] = u.v; + preempt_enable(); + return 0; +} + +static nokprobe_inline int do_vec_store(int rn, unsigned long ea, + int size, struct pt_regs *regs, + bool cross_endian) +{ + union { + __vector128 v; + u8 b[sizeof(__vector128)]; + } u; + + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + /* align to multiple of size */ + ea &= ~(size - 1); + + preempt_disable(); + if (regs->msr & MSR_VEC) + get_vr(rn, &u.v); + else + u.v = current->thread.vr_state.vr[rn]; + preempt_enable(); + if (unlikely(cross_endian)) + do_byte_reverse(&u.b[ea & 0xf], size); + return copy_mem_out(&u.b[ea & 0xf], ea, size, regs); +} +#endif /* CONFIG_ALTIVEC */ + +#ifdef __powerpc64__ +static nokprobe_inline int emulate_lq(struct pt_regs *regs, unsigned long ea, + int reg, bool cross_endian) +{ + int err; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + /* if aligned, should be atomic */ + if ((ea & 0xf) == 0) { + err = do_lq(ea, ®s->gpr[reg]); + } else { + err = read_mem(®s->gpr[reg + IS_LE], ea, 8, regs); + if (!err) + err = read_mem(®s->gpr[reg + IS_BE], ea + 8, 8, regs); + } + if (!err && unlikely(cross_endian)) + do_byte_reverse(®s->gpr[reg], 16); + return err; +} + +static nokprobe_inline int emulate_stq(struct pt_regs *regs, unsigned long ea, + int reg, bool cross_endian) +{ + int err; + unsigned long vals[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + vals[0] = regs->gpr[reg]; + vals[1] = regs->gpr[reg + 1]; + if (unlikely(cross_endian)) + do_byte_reverse(vals, 16); + + /* if aligned, should be atomic */ + if ((ea & 0xf) == 0) + return do_stq(ea, vals[0], vals[1]); + + err = write_mem(vals[IS_LE], ea, 8, regs); + if (!err) + err = write_mem(vals[IS_BE], ea + 8, 8, regs); + return err; +} +#endif /* __powerpc64 */ + +#ifdef CONFIG_VSX +void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, + const void *mem, bool rev) +{ + int size, read_size; + int i, j; + const unsigned int *wp; + const unsigned short *hp; + const unsigned char *bp; + + size = GETSIZE(op->type); + reg->d[0] = reg->d[1] = 0; + + switch (op->element_size) { + case 32: + /* [p]lxvp[x] */ + case 16: + /* whole vector; lxv[x] or lxvl[l] */ + if (size == 0) + break; + memcpy(reg, mem, size); + if (IS_LE && (op->vsx_flags & VSX_LDLEFT)) + rev = !rev; + if (rev) + do_byte_reverse(reg, size); + break; + case 8: + /* scalar loads, lxvd2x, lxvdsx */ + read_size = (size >= 8) ? 8 : size; + i = IS_LE ? 8 : 8 - read_size; + memcpy(®->b[i], mem, read_size); + if (rev) + do_byte_reverse(®->b[i], 8); + if (size < 8) { + if (op->type & SIGNEXT) { + /* size == 4 is the only case here */ + reg->d[IS_LE] = (signed int) reg->d[IS_LE]; + } else if (op->vsx_flags & VSX_FPCONV) { + preempt_disable(); + conv_sp_to_dp(®->fp[1 + IS_LE], + ®->dp[IS_LE]); + preempt_enable(); + } + } else { + if (size == 16) { + unsigned long v = *(unsigned long *)(mem + 8); + reg->d[IS_BE] = !rev ? v : byterev_8(v); + } else if (op->vsx_flags & VSX_SPLAT) + reg->d[IS_BE] = reg->d[IS_LE]; + } + break; + case 4: + /* lxvw4x, lxvwsx */ + wp = mem; + for (j = 0; j < size / 4; ++j) { + i = IS_LE ? 3 - j : j; + reg->w[i] = !rev ? *wp++ : byterev_4(*wp++); + } + if (op->vsx_flags & VSX_SPLAT) { + u32 val = reg->w[IS_LE ? 3 : 0]; + for (; j < 4; ++j) { + i = IS_LE ? 3 - j : j; + reg->w[i] = val; + } + } + break; + case 2: + /* lxvh8x */ + hp = mem; + for (j = 0; j < size / 2; ++j) { + i = IS_LE ? 7 - j : j; + reg->h[i] = !rev ? *hp++ : byterev_2(*hp++); + } + break; + case 1: + /* lxvb16x */ + bp = mem; + for (j = 0; j < size; ++j) { + i = IS_LE ? 15 - j : j; + reg->b[i] = *bp++; + } + break; + } +} +EXPORT_SYMBOL_GPL(emulate_vsx_load); +NOKPROBE_SYMBOL(emulate_vsx_load); + +void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, + void *mem, bool rev) +{ + int size, write_size; + int i, j; + union vsx_reg buf; + unsigned int *wp; + unsigned short *hp; + unsigned char *bp; + + size = GETSIZE(op->type); + + switch (op->element_size) { + case 32: + /* [p]stxvp[x] */ + if (size == 0) + break; + if (rev) { + /* reverse 32 bytes */ + union vsx_reg buf32[2]; + buf32[0].d[0] = byterev_8(reg[1].d[1]); + buf32[0].d[1] = byterev_8(reg[1].d[0]); + buf32[1].d[0] = byterev_8(reg[0].d[1]); + buf32[1].d[1] = byterev_8(reg[0].d[0]); + memcpy(mem, buf32, size); + } else { + memcpy(mem, reg, size); + } + break; + case 16: + /* stxv, stxvx, stxvl, stxvll */ + if (size == 0) + break; + if (IS_LE && (op->vsx_flags & VSX_LDLEFT)) + rev = !rev; + if (rev) { + /* reverse 16 bytes */ + buf.d[0] = byterev_8(reg->d[1]); + buf.d[1] = byterev_8(reg->d[0]); + reg = &buf; + } + memcpy(mem, reg, size); + break; + case 8: + /* scalar stores, stxvd2x */ + write_size = (size >= 8) ? 8 : size; + i = IS_LE ? 8 : 8 - write_size; + if (size < 8 && op->vsx_flags & VSX_FPCONV) { + buf.d[0] = buf.d[1] = 0; + preempt_disable(); + conv_dp_to_sp(®->dp[IS_LE], &buf.fp[1 + IS_LE]); + preempt_enable(); + reg = &buf; + } + memcpy(mem, ®->b[i], write_size); + if (size == 16) + memcpy(mem + 8, ®->d[IS_BE], 8); + if (unlikely(rev)) { + do_byte_reverse(mem, write_size); + if (size == 16) + do_byte_reverse(mem + 8, 8); + } + break; + case 4: + /* stxvw4x */ + wp = mem; + for (j = 0; j < size / 4; ++j) { + i = IS_LE ? 3 - j : j; + *wp++ = !rev ? reg->w[i] : byterev_4(reg->w[i]); + } + break; + case 2: + /* stxvh8x */ + hp = mem; + for (j = 0; j < size / 2; ++j) { + i = IS_LE ? 7 - j : j; + *hp++ = !rev ? reg->h[i] : byterev_2(reg->h[i]); + } + break; + case 1: + /* stvxb16x */ + bp = mem; + for (j = 0; j < size; ++j) { + i = IS_LE ? 15 - j : j; + *bp++ = reg->b[i]; + } + break; + } +} +EXPORT_SYMBOL_GPL(emulate_vsx_store); +NOKPROBE_SYMBOL(emulate_vsx_store); + +static nokprobe_inline int do_vsx_load(struct instruction_op *op, + unsigned long ea, struct pt_regs *regs, + bool cross_endian) +{ + int reg = op->reg; + int i, j, nr_vsx_regs; + u8 mem[32]; + union vsx_reg buf[2]; + int size = GETSIZE(op->type); + + if (!address_ok(regs, ea, size) || copy_mem_in(mem, ea, size, regs)) + return -EFAULT; + + nr_vsx_regs = max(1ul, size / sizeof(__vector128)); + emulate_vsx_load(op, buf, mem, cross_endian); + preempt_disable(); + if (reg < 32) { + /* FP regs + extensions */ + if (regs->msr & MSR_FP) { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + load_vsrn(reg + i, &buf[j].v); + } + } else { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + current->thread.fp_state.fpr[reg + i][0] = buf[j].d[0]; + current->thread.fp_state.fpr[reg + i][1] = buf[j].d[1]; + } + } + } else { + if (regs->msr & MSR_VEC) { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + load_vsrn(reg + i, &buf[j].v); + } + } else { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + current->thread.vr_state.vr[reg - 32 + i] = buf[j].v; + } + } + } + preempt_enable(); + return 0; +} + +static nokprobe_inline int do_vsx_store(struct instruction_op *op, + unsigned long ea, struct pt_regs *regs, + bool cross_endian) +{ + int reg = op->reg; + int i, j, nr_vsx_regs; + u8 mem[32]; + union vsx_reg buf[2]; + int size = GETSIZE(op->type); + + if (!address_ok(regs, ea, size)) + return -EFAULT; + + nr_vsx_regs = max(1ul, size / sizeof(__vector128)); + preempt_disable(); + if (reg < 32) { + /* FP regs + extensions */ + if (regs->msr & MSR_FP) { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + store_vsrn(reg + i, &buf[j].v); + } + } else { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + buf[j].d[0] = current->thread.fp_state.fpr[reg + i][0]; + buf[j].d[1] = current->thread.fp_state.fpr[reg + i][1]; + } + } + } else { + if (regs->msr & MSR_VEC) { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + store_vsrn(reg + i, &buf[j].v); + } + } else { + for (i = 0; i < nr_vsx_regs; i++) { + j = IS_LE ? nr_vsx_regs - i - 1 : i; + buf[j].v = current->thread.vr_state.vr[reg - 32 + i]; + } + } + } + preempt_enable(); + emulate_vsx_store(op, buf, mem, cross_endian); + return copy_mem_out(mem, ea, size, regs); +} +#endif /* CONFIG_VSX */ + +static __always_inline int __emulate_dcbz(unsigned long ea) +{ + unsigned long i; + unsigned long size = l1_dcache_bytes(); + + for (i = 0; i < size; i += sizeof(long)) + unsafe_put_user(0, (unsigned long __user *)(ea + i), Efault); + + return 0; + +Efault: + return -EFAULT; +} + +int emulate_dcbz(unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long size = l1_dcache_bytes(); + + ea = truncate_if_32bit(regs->msr, ea); + ea &= ~(size - 1); + if (!address_ok(regs, ea, size)) + return -EFAULT; + + if (is_kernel_addr(ea)) { + err = __emulate_dcbz(ea); + } else if (user_write_access_begin((void __user *)ea, size)) { + err = __emulate_dcbz(ea); + user_write_access_end(); + } else { + err = -EFAULT; + } + + if (err) + regs->dar = ea; + + + return err; +} +NOKPROBE_SYMBOL(emulate_dcbz); + +#define __put_user_asmx(x, addr, err, op, cr) \ + __asm__ __volatile__( \ + ".machine push\n" \ + ".machine power8\n" \ + "1: " op " %2,0,%3\n" \ + ".machine pop\n" \ + " mfcr %1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%4\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err), "=r" (cr) \ + : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __get_user_asmx(x, addr, err, op) \ + __asm__ __volatile__( \ + ".machine push\n" \ + ".machine power8\n" \ + "1: "op" %1,0,%2\n" \ + ".machine pop\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err), "=r" (x) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __cacheop_user_asmx(addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" 0,%1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +static nokprobe_inline void set_cr0(const struct pt_regs *regs, + struct instruction_op *op) +{ + long val = op->val; + + op->type |= SETCC; + op->ccval = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); + if (!(regs->msr & MSR_64BIT)) + val = (int) val; + if (val < 0) + op->ccval |= 0x80000000; + else if (val > 0) + op->ccval |= 0x40000000; + else + op->ccval |= 0x20000000; +} + +static nokprobe_inline void set_ca32(struct instruction_op *op, bool val) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (val) + op->xerval |= XER_CA32; + else + op->xerval &= ~XER_CA32; + } +} + +static nokprobe_inline void add_with_carry(const struct pt_regs *regs, + struct instruction_op *op, int rd, + unsigned long val1, unsigned long val2, + unsigned long carry_in) +{ + unsigned long val = val1 + val2; + + if (carry_in) + ++val; + op->type = COMPUTE | SETREG | SETXER; + op->reg = rd; + op->val = val; + val = truncate_if_32bit(regs->msr, val); + val1 = truncate_if_32bit(regs->msr, val1); + op->xerval = regs->xer; + if (val < val1 || (carry_in && val == val1)) + op->xerval |= XER_CA; + else + op->xerval &= ~XER_CA; + + set_ca32(op, (unsigned int)val < (unsigned int)val1 || + (carry_in && (unsigned int)val == (unsigned int)val1)); +} + +static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs, + struct instruction_op *op, + long v1, long v2, int crfld) +{ + unsigned int crval, shift; + + op->type = COMPUTE | SETCC; + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + op->ccval = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static nokprobe_inline void do_cmp_unsigned(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, + unsigned long v2, int crfld) +{ + unsigned int crval, shift; + + op->type = COMPUTE | SETCC; + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + op->ccval = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static nokprobe_inline void do_cmpb(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, unsigned long v2) +{ + unsigned long long out_val, mask; + int i; + + out_val = 0; + for (i = 0; i < 8; i++) { + mask = 0xffUL << (i * 8); + if ((v1 & mask) == (v2 & mask)) + out_val |= mask; + } + op->val = out_val; +} + +/* + * The size parameter is used to adjust the equivalent popcnt instruction. + * popcntb = 8, popcntw = 32, popcntd = 64 + */ +static nokprobe_inline void do_popcnt(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, int size) +{ + unsigned long long out = v1; + + out -= (out >> 1) & 0x5555555555555555ULL; + out = (0x3333333333333333ULL & out) + + (0x3333333333333333ULL & (out >> 2)); + out = (out + (out >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + + if (size == 8) { /* popcntb */ + op->val = out; + return; + } + out += out >> 8; + out += out >> 16; + if (size == 32) { /* popcntw */ + op->val = out & 0x0000003f0000003fULL; + return; + } + + out = (out + (out >> 32)) & 0x7f; + op->val = out; /* popcntd */ +} + +#ifdef CONFIG_PPC64 +static nokprobe_inline void do_bpermd(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v1, unsigned long v2) +{ + unsigned char perm, idx; + unsigned int i; + + perm = 0; + for (i = 0; i < 8; i++) { + idx = (v1 >> (i * 8)) & 0xff; + if (idx < 64) + if (v2 & PPC_BIT(idx)) + perm |= 1 << i; + } + op->val = perm; +} +#endif /* CONFIG_PPC64 */ +/* + * The size parameter adjusts the equivalent prty instruction. + * prtyw = 32, prtyd = 64 + */ +static nokprobe_inline void do_prty(const struct pt_regs *regs, + struct instruction_op *op, + unsigned long v, int size) +{ + unsigned long long res = v ^ (v >> 8); + + res ^= res >> 16; + if (size == 32) { /* prtyw */ + op->val = res & 0x0000000100000001ULL; + return; + } + + res ^= res >> 32; + op->val = res & 1; /*prtyd */ +} + +static nokprobe_inline int trap_compare(long v1, long v2) +{ + int ret = 0; + + if (v1 < v2) + ret |= 0x10; + else if (v1 > v2) + ret |= 0x08; + else + ret |= 0x04; + if ((unsigned long)v1 < (unsigned long)v2) + ret |= 0x02; + else if ((unsigned long)v1 > (unsigned long)v2) + ret |= 0x01; + return ret; +} + +/* + * Elements of 32-bit rotate and mask instructions. + */ +#define MASK32(mb, me) ((0xffffffffUL >> (mb)) + \ + ((signed long)-0x80000000L >> (me)) + ((me) >= (mb))) +#ifdef __powerpc64__ +#define MASK64_L(mb) (~0UL >> (mb)) +#define MASK64_R(me) ((signed long)-0x8000000000000000L >> (me)) +#define MASK64(mb, me) (MASK64_L(mb) + MASK64_R(me) + ((me) >= (mb))) +#define DATA32(x) (((x) & 0xffffffffUL) | (((x) & 0xffffffffUL) << 32)) +#else +#define DATA32(x) (x) +#endif +#define ROTATE(x, n) ((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x)) + +/* + * Decode an instruction, and return information about it in *op + * without changing *regs. + * Integer arithmetic and logical instructions, branches, and barrier + * instructions can be emulated just using the information in *op. + * + * Return value is 1 if the instruction can be emulated just by + * updating *regs with the information in *op, -1 if we need the + * GPRs but *regs doesn't contain the full register set, or 0 + * otherwise. + */ +int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, + ppc_inst_t instr) +{ +#ifdef CONFIG_PPC64 + unsigned int suffixopcode, prefixtype, prefix_r; +#endif + unsigned int opcode, ra, rb, rc, rd, spr, u; + unsigned long int imm; + unsigned long int val, val2; + unsigned int mb, me, sh; + unsigned int word, suffix; + long ival; + + word = ppc_inst_val(instr); + suffix = ppc_inst_suffix(instr); + + op->type = COMPUTE; + + opcode = ppc_inst_primary_opcode(instr); + switch (opcode) { + case 16: /* bc */ + op->type = BRANCH; + imm = (signed short)(word & 0xfffc); + if ((word & 2) == 0) + imm += regs->nip; + op->val = truncate_if_32bit(regs->msr, imm); + if (word & 1) + op->type |= SETLK; + if (branch_taken(word, regs, op)) + op->type |= BRTAKEN; + return 1; + case 17: /* sc */ + if ((word & 0xfe2) == 2) + op->type = SYSCALL; + else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && + (word & 0xfe3) == 1) { /* scv */ + op->type = SYSCALL_VECTORED_0; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + } else + op->type = UNKNOWN; + return 0; + case 18: /* b */ + op->type = BRANCH | BRTAKEN; + imm = word & 0x03fffffc; + if (imm & 0x02000000) + imm -= 0x04000000; + if ((word & 2) == 0) + imm += regs->nip; + op->val = truncate_if_32bit(regs->msr, imm); + if (word & 1) + op->type |= SETLK; + return 1; + case 19: + switch ((word >> 1) & 0x3ff) { + case 0: /* mcrf */ + op->type = COMPUTE + SETCC; + rd = 7 - ((word >> 23) & 0x7); + ra = 7 - ((word >> 18) & 0x7); + rd *= 4; + ra *= 4; + val = (regs->ccr >> ra) & 0xf; + op->ccval = (regs->ccr & ~(0xfUL << rd)) | (val << rd); + return 1; + + case 16: /* bclr */ + case 528: /* bcctr */ + op->type = BRANCH; + imm = (word & 0x400)? regs->ctr: regs->link; + op->val = truncate_if_32bit(regs->msr, imm); + if (word & 1) + op->type |= SETLK; + if (branch_taken(word, regs, op)) + op->type |= BRTAKEN; + return 1; + + case 18: /* rfid, scary */ + if (regs->msr & MSR_PR) + goto priv; + op->type = RFI; + return 0; + + case 150: /* isync */ + op->type = BARRIER | BARRIER_ISYNC; + return 1; + + case 33: /* crnor */ + case 129: /* crandc */ + case 193: /* crxor */ + case 225: /* crnand */ + case 257: /* crand */ + case 289: /* creqv */ + case 417: /* crorc */ + case 449: /* cror */ + op->type = COMPUTE + SETCC; + ra = (word >> 16) & 0x1f; + rb = (word >> 11) & 0x1f; + rd = (word >> 21) & 0x1f; + ra = (regs->ccr >> (31 - ra)) & 1; + rb = (regs->ccr >> (31 - rb)) & 1; + val = (word >> (6 + ra * 2 + rb)) & 1; + op->ccval = (regs->ccr & ~(1UL << (31 - rd))) | + (val << (31 - rd)); + return 1; + } + break; + case 31: + switch ((word >> 1) & 0x3ff) { + case 598: /* sync */ + op->type = BARRIER + BARRIER_SYNC; +#ifdef __powerpc64__ + switch ((word >> 21) & 3) { + case 1: /* lwsync */ + op->type = BARRIER + BARRIER_LWSYNC; + break; + case 2: /* ptesync */ + op->type = BARRIER + BARRIER_PTESYNC; + break; + } +#endif + return 1; + + case 854: /* eieio */ + op->type = BARRIER + BARRIER_EIEIO; + return 1; + } + break; + } + + rd = (word >> 21) & 0x1f; + ra = (word >> 16) & 0x1f; + rb = (word >> 11) & 0x1f; + rc = (word >> 6) & 0x1f; + + switch (opcode) { +#ifdef __powerpc64__ + case 1: + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + goto unknown_opcode; + + prefix_r = GET_PREFIX_R(word); + ra = GET_PREFIX_RA(suffix); + rd = (suffix >> 21) & 0x1f; + op->reg = rd; + op->val = regs->gpr[rd]; + suffixopcode = get_op(suffix); + prefixtype = (word >> 24) & 0x3; + switch (prefixtype) { + case 2: + if (prefix_r && ra) + return 0; + switch (suffixopcode) { + case 14: /* paddi */ + op->type = COMPUTE | PREFIXED; + op->val = mlsd_8lsd_ea(word, suffix, regs); + goto compute_done; + } + } + break; + case 2: /* tdi */ + if (rd & trap_compare(regs->gpr[ra], (short) word)) + goto trap; + return 1; +#endif + case 3: /* twi */ + if (rd & trap_compare((int)regs->gpr[ra], (short) word)) + goto trap; + return 1; + +#ifdef __powerpc64__ + case 4: + /* + * There are very many instructions with this primary opcode + * introduced in the ISA as early as v2.03. However, the ones + * we currently emulate were all introduced with ISA 3.0 + */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + + switch (word & 0x3f) { + case 48: /* maddhd */ + asm volatile(PPC_MADDHD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 49: /* maddhdu */ + asm volatile(PPC_MADDHDU(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 51: /* maddld */ + asm volatile(PPC_MADDLD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + } + + /* + * There are other instructions from ISA 3.0 with the same + * primary opcode which do not have emulation support yet. + */ + goto unknown_opcode; +#endif + + case 7: /* mulli */ + op->val = regs->gpr[ra] * (short) word; + goto compute_done; + + case 8: /* subfic */ + imm = (short) word; + add_with_carry(regs, op, rd, ~regs->gpr[ra], imm, 1); + return 1; + + case 10: /* cmpli */ + imm = (unsigned short) word; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (unsigned int) val; +#endif + do_cmp_unsigned(regs, op, val, imm, rd >> 2); + return 1; + + case 11: /* cmpi */ + imm = (short) word; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (int) val; +#endif + do_cmp_signed(regs, op, val, imm, rd >> 2); + return 1; + + case 12: /* addic */ + imm = (short) word; + add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); + return 1; + + case 13: /* addic. */ + imm = (short) word; + add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); + set_cr0(regs, op); + return 1; + + case 14: /* addi */ + imm = (short) word; + if (ra) + imm += regs->gpr[ra]; + op->val = imm; + goto compute_done; + + case 15: /* addis */ + imm = ((short) word) << 16; + if (ra) + imm += regs->gpr[ra]; + op->val = imm; + goto compute_done; + + case 19: + if (((word >> 1) & 0x1f) == 2) { + /* addpcis */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + imm = (short) (word & 0xffc1); /* d0 + d2 fields */ + imm |= (word >> 15) & 0x3e; /* d1 field */ + op->val = regs->nip + (imm << 16) + 4; + goto compute_done; + } + op->type = UNKNOWN; + return 0; + + case 20: /* rlwimi */ + mb = (word >> 6) & 0x1f; + me = (word >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + imm = MASK32(mb, me); + op->val = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); + goto logical_done; + + case 21: /* rlwinm */ + mb = (word >> 6) & 0x1f; + me = (word >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + op->val = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 23: /* rlwnm */ + mb = (word >> 6) & 0x1f; + me = (word >> 1) & 0x1f; + rb = regs->gpr[rb] & 0x1f; + val = DATA32(regs->gpr[rd]); + op->val = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 24: /* ori */ + op->val = regs->gpr[rd] | (unsigned short) word; + goto logical_done_nocc; + + case 25: /* oris */ + imm = (unsigned short) word; + op->val = regs->gpr[rd] | (imm << 16); + goto logical_done_nocc; + + case 26: /* xori */ + op->val = regs->gpr[rd] ^ (unsigned short) word; + goto logical_done_nocc; + + case 27: /* xoris */ + imm = (unsigned short) word; + op->val = regs->gpr[rd] ^ (imm << 16); + goto logical_done_nocc; + + case 28: /* andi. */ + op->val = regs->gpr[rd] & (unsigned short) word; + set_cr0(regs, op); + goto logical_done_nocc; + + case 29: /* andis. */ + imm = (unsigned short) word; + op->val = regs->gpr[rd] & (imm << 16); + set_cr0(regs, op); + goto logical_done_nocc; + +#ifdef __powerpc64__ + case 30: /* rld* */ + mb = ((word >> 6) & 0x1f) | (word & 0x20); + val = regs->gpr[rd]; + if ((word & 0x10) == 0) { + sh = rb | ((word & 2) << 4); + val = ROTATE(val, sh); + switch ((word >> 2) & 3) { + case 0: /* rldicl */ + val &= MASK64_L(mb); + break; + case 1: /* rldicr */ + val &= MASK64_R(mb); + break; + case 2: /* rldic */ + val &= MASK64(mb, 63 - sh); + break; + case 3: /* rldimi */ + imm = MASK64(mb, 63 - sh); + val = (regs->gpr[ra] & ~imm) | + (val & imm); + } + op->val = val; + goto logical_done; + } else { + sh = regs->gpr[rb] & 0x3f; + val = ROTATE(val, sh); + switch ((word >> 1) & 7) { + case 0: /* rldcl */ + op->val = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldcr */ + op->val = val & MASK64_R(mb); + goto logical_done; + } + } +#endif + op->type = UNKNOWN; /* illegal instruction */ + return 0; + + case 31: + /* isel occupies 32 minor opcodes */ + if (((word >> 1) & 0x1f) == 15) { + mb = (word >> 6) & 0x1f; /* bc field */ + val = (regs->ccr >> (31 - mb)) & 1; + val2 = (ra) ? regs->gpr[ra] : 0; + + op->val = (val) ? val2 : regs->gpr[rb]; + goto compute_done; + } + + switch ((word >> 1) & 0x3ff) { + case 4: /* tw */ + if (rd == 0x1f || + (rd & trap_compare((int)regs->gpr[ra], + (int)regs->gpr[rb]))) + goto trap; + return 1; +#ifdef __powerpc64__ + case 68: /* td */ + if (rd & trap_compare(regs->gpr[ra], regs->gpr[rb])) + goto trap; + return 1; +#endif + case 83: /* mfmsr */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MFMSR; + op->reg = rd; + return 0; + case 146: /* mtmsr */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MTMSR; + op->reg = rd; + op->val = 0xffffffff & ~(MSR_ME | MSR_LE); + return 0; +#ifdef CONFIG_PPC64 + case 178: /* mtmsrd */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MTMSR; + op->reg = rd; + /* only MSR_EE and MSR_RI get changed if bit 15 set */ + /* mtmsrd doesn't change MSR_HV, MSR_ME or MSR_LE */ + imm = (word & 0x10000)? 0x8002: 0xefffffffffffeffeUL; + op->val = imm; + return 0; +#endif + + case 19: /* mfcr */ + imm = 0xffffffffUL; + if ((word >> 20) & 1) { + imm = 0xf0000000UL; + for (sh = 0; sh < 8; ++sh) { + if (word & (0x80000 >> sh)) + break; + imm >>= 4; + } + } + op->val = regs->ccr & imm; + goto compute_done; + + case 128: /* setb */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + /* + * 'ra' encodes the CR field number (bfa) in the top 3 bits. + * Since each CR field is 4 bits, + * we can simply mask off the bottom two bits (bfa * 4) + * to yield the first bit in the CR field. + */ + ra = ra & ~0x3; + /* 'val' stores bits of the CR field (bfa) */ + val = regs->ccr >> (CR0_SHIFT - ra); + /* checks if the LT bit of CR field (bfa) is set */ + if (val & 8) + op->val = -1; + /* checks if the GT bit of CR field (bfa) is set */ + else if (val & 4) + op->val = 1; + else + op->val = 0; + goto compute_done; + + case 144: /* mtcrf */ + op->type = COMPUTE + SETCC; + imm = 0xf0000000UL; + val = regs->gpr[rd]; + op->ccval = regs->ccr; + for (sh = 0; sh < 8; ++sh) { + if (word & (0x80000 >> sh)) + op->ccval = (op->ccval & ~imm) | + (val & imm); + imm >>= 4; + } + return 1; + + case 339: /* mfspr */ + spr = ((word >> 16) & 0x1f) | ((word >> 6) & 0x3e0); + op->type = MFSPR; + op->reg = rd; + op->spr = spr; + if (spr == SPRN_XER || spr == SPRN_LR || + spr == SPRN_CTR) + return 1; + return 0; + + case 467: /* mtspr */ + spr = ((word >> 16) & 0x1f) | ((word >> 6) & 0x3e0); + op->type = MTSPR; + op->val = regs->gpr[rd]; + op->spr = spr; + if (spr == SPRN_XER || spr == SPRN_LR || + spr == SPRN_CTR) + return 1; + return 0; + +/* + * Compare instructions + */ + case 0: /* cmp */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (int) val; + val2 = (int) val2; + } +#endif + do_cmp_signed(regs, op, val, val2, rd >> 2); + return 1; + + case 32: /* cmpl */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (unsigned int) val; + val2 = (unsigned int) val2; + } +#endif + do_cmp_unsigned(regs, op, val, val2, rd >> 2); + return 1; + + case 508: /* cmpb */ + do_cmpb(regs, op, regs->gpr[rd], regs->gpr[rb]); + goto logical_done_nocc; + +/* + * Arithmetic instructions + */ + case 8: /* subfc */ + add_with_carry(regs, op, rd, ~regs->gpr[ra], + regs->gpr[rb], 1); + goto arith_done; +#ifdef __powerpc64__ + case 9: /* mulhdu */ + asm("mulhdu %0,%1,%2" : "=r" (op->val) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 10: /* addc */ + add_with_carry(regs, op, rd, regs->gpr[ra], + regs->gpr[rb], 0); + goto arith_done; + + case 11: /* mulhwu */ + asm("mulhwu %0,%1,%2" : "=r" (op->val) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 40: /* subf */ + op->val = regs->gpr[rb] - regs->gpr[ra]; + goto arith_done; +#ifdef __powerpc64__ + case 73: /* mulhd */ + asm("mulhd %0,%1,%2" : "=r" (op->val) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 75: /* mulhw */ + asm("mulhw %0,%1,%2" : "=r" (op->val) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 104: /* neg */ + op->val = -regs->gpr[ra]; + goto arith_done; + + case 136: /* subfe */ + add_with_carry(regs, op, rd, ~regs->gpr[ra], + regs->gpr[rb], regs->xer & XER_CA); + goto arith_done; + + case 138: /* adde */ + add_with_carry(regs, op, rd, regs->gpr[ra], + regs->gpr[rb], regs->xer & XER_CA); + goto arith_done; + + case 200: /* subfze */ + add_with_carry(regs, op, rd, ~regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 202: /* addze */ + add_with_carry(regs, op, rd, regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 232: /* subfme */ + add_with_carry(regs, op, rd, ~regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; +#ifdef __powerpc64__ + case 233: /* mulld */ + op->val = regs->gpr[ra] * regs->gpr[rb]; + goto arith_done; +#endif + case 234: /* addme */ + add_with_carry(regs, op, rd, regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; + + case 235: /* mullw */ + op->val = (long)(int) regs->gpr[ra] * + (int) regs->gpr[rb]; + + goto arith_done; +#ifdef __powerpc64__ + case 265: /* modud */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->val = regs->gpr[ra] % regs->gpr[rb]; + goto compute_done; +#endif + case 266: /* add */ + op->val = regs->gpr[ra] + regs->gpr[rb]; + goto arith_done; + + case 267: /* moduw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->val = (unsigned int) regs->gpr[ra] % + (unsigned int) regs->gpr[rb]; + goto compute_done; +#ifdef __powerpc64__ + case 457: /* divdu */ + op->val = regs->gpr[ra] / regs->gpr[rb]; + goto arith_done; +#endif + case 459: /* divwu */ + op->val = (unsigned int) regs->gpr[ra] / + (unsigned int) regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 489: /* divd */ + op->val = (long int) regs->gpr[ra] / + (long int) regs->gpr[rb]; + goto arith_done; +#endif + case 491: /* divw */ + op->val = (int) regs->gpr[ra] / + (int) regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 425: /* divde[.] */ + asm volatile(PPC_DIVDE(%0, %1, %2) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb])); + goto arith_done; + case 393: /* divdeu[.] */ + asm volatile(PPC_DIVDEU(%0, %1, %2) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 755: /* darn */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + switch (ra & 0x3) { + case 0: + /* 32-bit conditioned */ + asm volatile(PPC_DARN(%0, 0) : "=r" (op->val)); + goto compute_done; + + case 1: + /* 64-bit conditioned */ + asm volatile(PPC_DARN(%0, 1) : "=r" (op->val)); + goto compute_done; + + case 2: + /* 64-bit raw */ + asm volatile(PPC_DARN(%0, 2) : "=r" (op->val)); + goto compute_done; + } + + goto unknown_opcode; +#ifdef __powerpc64__ + case 777: /* modsd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->val = (long int) regs->gpr[ra] % + (long int) regs->gpr[rb]; + goto compute_done; +#endif + case 779: /* modsw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->val = (int) regs->gpr[ra] % + (int) regs->gpr[rb]; + goto compute_done; + + +/* + * Logical instructions + */ + case 26: /* cntlzw */ + val = (unsigned int) regs->gpr[rd]; + op->val = ( val ? __builtin_clz(val) : 32 ); + goto logical_done; +#ifdef __powerpc64__ + case 58: /* cntlzd */ + val = regs->gpr[rd]; + op->val = ( val ? __builtin_clzl(val) : 64 ); + goto logical_done; +#endif + case 28: /* and */ + op->val = regs->gpr[rd] & regs->gpr[rb]; + goto logical_done; + + case 60: /* andc */ + op->val = regs->gpr[rd] & ~regs->gpr[rb]; + goto logical_done; + + case 122: /* popcntb */ + do_popcnt(regs, op, regs->gpr[rd], 8); + goto logical_done_nocc; + + case 124: /* nor */ + op->val = ~(regs->gpr[rd] | regs->gpr[rb]); + goto logical_done; + + case 154: /* prtyw */ + do_prty(regs, op, regs->gpr[rd], 32); + goto logical_done_nocc; + + case 186: /* prtyd */ + do_prty(regs, op, regs->gpr[rd], 64); + goto logical_done_nocc; +#ifdef CONFIG_PPC64 + case 252: /* bpermd */ + do_bpermd(regs, op, regs->gpr[rd], regs->gpr[rb]); + goto logical_done_nocc; +#endif + case 284: /* xor */ + op->val = ~(regs->gpr[rd] ^ regs->gpr[rb]); + goto logical_done; + + case 316: /* xor */ + op->val = regs->gpr[rd] ^ regs->gpr[rb]; + goto logical_done; + + case 378: /* popcntw */ + do_popcnt(regs, op, regs->gpr[rd], 32); + goto logical_done_nocc; + + case 412: /* orc */ + op->val = regs->gpr[rd] | ~regs->gpr[rb]; + goto logical_done; + + case 444: /* or */ + op->val = regs->gpr[rd] | regs->gpr[rb]; + goto logical_done; + + case 476: /* nand */ + op->val = ~(regs->gpr[rd] & regs->gpr[rb]); + goto logical_done; +#ifdef CONFIG_PPC64 + case 506: /* popcntd */ + do_popcnt(regs, op, regs->gpr[rd], 64); + goto logical_done_nocc; +#endif + case 538: /* cnttzw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + val = (unsigned int) regs->gpr[rd]; + op->val = (val ? __builtin_ctz(val) : 32); + goto logical_done; +#ifdef __powerpc64__ + case 570: /* cnttzd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + val = regs->gpr[rd]; + op->val = (val ? __builtin_ctzl(val) : 64); + goto logical_done; +#endif + case 922: /* extsh */ + op->val = (signed short) regs->gpr[rd]; + goto logical_done; + + case 954: /* extsb */ + op->val = (signed char) regs->gpr[rd]; + goto logical_done; +#ifdef __powerpc64__ + case 986: /* extsw */ + op->val = (signed int) regs->gpr[rd]; + goto logical_done; +#endif + +/* + * Shift instructions + */ + case 24: /* slw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + op->val = (regs->gpr[rd] << sh) & 0xffffffffUL; + else + op->val = 0; + goto logical_done; + + case 536: /* srw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + op->val = (regs->gpr[rd] & 0xffffffffUL) >> sh; + else + op->val = 0; + goto logical_done; + + case 792: /* sraw */ + op->type = COMPUTE + SETREG + SETXER; + sh = regs->gpr[rb] & 0x3f; + ival = (signed int) regs->gpr[rd]; + op->val = ival >> (sh < 32 ? sh : 31); + op->xerval = regs->xer; + if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0)) + op->xerval |= XER_CA; + else + op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); + goto logical_done; + + case 824: /* srawi */ + op->type = COMPUTE + SETREG + SETXER; + sh = rb; + ival = (signed int) regs->gpr[rd]; + op->val = ival >> sh; + op->xerval = regs->xer; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + op->xerval |= XER_CA; + else + op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); + goto logical_done; + +#ifdef __powerpc64__ + case 27: /* sld */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + op->val = regs->gpr[rd] << sh; + else + op->val = 0; + goto logical_done; + + case 539: /* srd */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + op->val = regs->gpr[rd] >> sh; + else + op->val = 0; + goto logical_done; + + case 794: /* srad */ + op->type = COMPUTE + SETREG + SETXER; + sh = regs->gpr[rb] & 0x7f; + ival = (signed long int) regs->gpr[rd]; + op->val = ival >> (sh < 64 ? sh : 63); + op->xerval = regs->xer; + if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0)) + op->xerval |= XER_CA; + else + op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); + goto logical_done; + + case 826: /* sradi with sh_5 = 0 */ + case 827: /* sradi with sh_5 = 1 */ + op->type = COMPUTE + SETREG + SETXER; + sh = rb | ((word & 2) << 4); + ival = (signed long int) regs->gpr[rd]; + op->val = ival >> sh; + op->xerval = regs->xer; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + op->xerval |= XER_CA; + else + op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); + goto logical_done; + + case 890: /* extswsli with sh_5 = 0 */ + case 891: /* extswsli with sh_5 = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->type = COMPUTE + SETREG; + sh = rb | ((word & 2) << 4); + val = (signed int) regs->gpr[rd]; + if (sh) + op->val = ROTATE(val, sh) & MASK64(0, 63 - sh); + else + op->val = val; + goto logical_done; + +#endif /* __powerpc64__ */ + +/* + * Cache instructions + */ + case 54: /* dcbst */ + op->type = MKOP(CACHEOP, DCBST, 0); + op->ea = xform_ea(word, regs); + return 0; + + case 86: /* dcbf */ + op->type = MKOP(CACHEOP, DCBF, 0); + op->ea = xform_ea(word, regs); + return 0; + + case 246: /* dcbtst */ + op->type = MKOP(CACHEOP, DCBTST, 0); + op->ea = xform_ea(word, regs); + op->reg = rd; + return 0; + + case 278: /* dcbt */ + op->type = MKOP(CACHEOP, DCBTST, 0); + op->ea = xform_ea(word, regs); + op->reg = rd; + return 0; + + case 982: /* icbi */ + op->type = MKOP(CACHEOP, ICBI, 0); + op->ea = xform_ea(word, regs); + return 0; + + case 1014: /* dcbz */ + op->type = MKOP(CACHEOP, DCBZ, 0); + op->ea = xform_ea(word, regs); + return 0; + } + break; + } + +/* + * Loads and stores. + */ + op->type = UNKNOWN; + op->update_reg = ra; + op->reg = rd; + op->val = regs->gpr[rd]; + u = (word >> 20) & UPDATE; + op->vsx_flags = 0; + + switch (opcode) { + case 31: + u = word & UPDATE; + op->ea = xform_ea(word, regs); + switch ((word >> 1) & 0x3ff) { + case 20: /* lwarx */ + op->type = MKOP(LARX, 0, 4); + break; + + case 150: /* stwcx. */ + op->type = MKOP(STCX, 0, 4); + break; + +#ifdef CONFIG_PPC_HAS_LBARX_LHARX + case 52: /* lbarx */ + op->type = MKOP(LARX, 0, 1); + break; + + case 694: /* stbcx. */ + op->type = MKOP(STCX, 0, 1); + break; + + case 116: /* lharx */ + op->type = MKOP(LARX, 0, 2); + break; + + case 726: /* sthcx. */ + op->type = MKOP(STCX, 0, 2); + break; +#endif +#ifdef __powerpc64__ + case 84: /* ldarx */ + op->type = MKOP(LARX, 0, 8); + break; + + case 214: /* stdcx. */ + op->type = MKOP(STCX, 0, 8); + break; + + case 276: /* lqarx */ + if (!((rd & 1) || rd == ra || rd == rb)) + op->type = MKOP(LARX, 0, 16); + break; + + case 182: /* stqcx. */ + if (!(rd & 1)) + op->type = MKOP(STCX, 0, 16); + break; +#endif + + case 23: /* lwzx */ + case 55: /* lwzux */ + op->type = MKOP(LOAD, u, 4); + break; + + case 87: /* lbzx */ + case 119: /* lbzux */ + op->type = MKOP(LOAD, u, 1); + break; + +#ifdef CONFIG_ALTIVEC + /* + * Note: for the load/store vector element instructions, + * bits of the EA say which field of the VMX register to use. + */ + case 7: /* lvebx */ + op->type = MKOP(LOAD_VMX, 0, 1); + op->element_size = 1; + break; + + case 39: /* lvehx */ + op->type = MKOP(LOAD_VMX, 0, 2); + op->element_size = 2; + break; + + case 71: /* lvewx */ + op->type = MKOP(LOAD_VMX, 0, 4); + op->element_size = 4; + break; + + case 103: /* lvx */ + case 359: /* lvxl */ + op->type = MKOP(LOAD_VMX, 0, 16); + op->element_size = 16; + break; + + case 135: /* stvebx */ + op->type = MKOP(STORE_VMX, 0, 1); + op->element_size = 1; + break; + + case 167: /* stvehx */ + op->type = MKOP(STORE_VMX, 0, 2); + op->element_size = 2; + break; + + case 199: /* stvewx */ + op->type = MKOP(STORE_VMX, 0, 4); + op->element_size = 4; + break; + + case 231: /* stvx */ + case 487: /* stvxl */ + op->type = MKOP(STORE_VMX, 0, 16); + break; +#endif /* CONFIG_ALTIVEC */ + +#ifdef __powerpc64__ + case 21: /* ldx */ + case 53: /* ldux */ + op->type = MKOP(LOAD, u, 8); + break; + + case 149: /* stdx */ + case 181: /* stdux */ + op->type = MKOP(STORE, u, 8); + break; +#endif + + case 151: /* stwx */ + case 183: /* stwux */ + op->type = MKOP(STORE, u, 4); + break; + + case 215: /* stbx */ + case 247: /* stbux */ + op->type = MKOP(STORE, u, 1); + break; + + case 279: /* lhzx */ + case 311: /* lhzux */ + op->type = MKOP(LOAD, u, 2); + break; + +#ifdef __powerpc64__ + case 341: /* lwax */ + case 373: /* lwaux */ + op->type = MKOP(LOAD, SIGNEXT | u, 4); + break; +#endif + + case 343: /* lhax */ + case 375: /* lhaux */ + op->type = MKOP(LOAD, SIGNEXT | u, 2); + break; + + case 407: /* sthx */ + case 439: /* sthux */ + op->type = MKOP(STORE, u, 2); + break; + +#ifdef __powerpc64__ + case 532: /* ldbrx */ + op->type = MKOP(LOAD, BYTEREV, 8); + break; + +#endif + case 533: /* lswx */ + op->type = MKOP(LOAD_MULTI, 0, regs->xer & 0x7f); + break; + + case 534: /* lwbrx */ + op->type = MKOP(LOAD, BYTEREV, 4); + break; + + case 597: /* lswi */ + if (rb == 0) + rb = 32; /* # bytes to load */ + op->type = MKOP(LOAD_MULTI, 0, rb); + op->ea = ra ? regs->gpr[ra] : 0; + break; + +#ifdef CONFIG_PPC_FPU + case 535: /* lfsx */ + case 567: /* lfsux */ + op->type = MKOP(LOAD_FP, u | FPCONV, 4); + break; + + case 599: /* lfdx */ + case 631: /* lfdux */ + op->type = MKOP(LOAD_FP, u, 8); + break; + + case 663: /* stfsx */ + case 695: /* stfsux */ + op->type = MKOP(STORE_FP, u | FPCONV, 4); + break; + + case 727: /* stfdx */ + case 759: /* stfdux */ + op->type = MKOP(STORE_FP, u, 8); + break; + +#ifdef __powerpc64__ + case 791: /* lfdpx */ + op->type = MKOP(LOAD_FP, 0, 16); + break; + + case 855: /* lfiwax */ + op->type = MKOP(LOAD_FP, SIGNEXT, 4); + break; + + case 887: /* lfiwzx */ + op->type = MKOP(LOAD_FP, 0, 4); + break; + + case 919: /* stfdpx */ + op->type = MKOP(STORE_FP, 0, 16); + break; + + case 983: /* stfiwx */ + op->type = MKOP(STORE_FP, 0, 4); + break; +#endif /* __powerpc64 */ +#endif /* CONFIG_PPC_FPU */ + +#ifdef __powerpc64__ + case 660: /* stdbrx */ + op->type = MKOP(STORE, BYTEREV, 8); + op->val = byterev_8(regs->gpr[rd]); + break; + +#endif + case 661: /* stswx */ + op->type = MKOP(STORE_MULTI, 0, regs->xer & 0x7f); + break; + + case 662: /* stwbrx */ + op->type = MKOP(STORE, BYTEREV, 4); + op->val = byterev_4(regs->gpr[rd]); + break; + + case 725: /* stswi */ + if (rb == 0) + rb = 32; /* # bytes to store */ + op->type = MKOP(STORE_MULTI, 0, rb); + op->ea = ra ? regs->gpr[ra] : 0; + break; + + case 790: /* lhbrx */ + op->type = MKOP(LOAD, BYTEREV, 2); + break; + + case 918: /* sthbrx */ + op->type = MKOP(STORE, BYTEREV, 2); + op->val = byterev_2(regs->gpr[rd]); + break; + +#ifdef CONFIG_VSX + case 12: /* lxsiwzx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + break; + + case 76: /* lxsiwax */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, SIGNEXT, 4); + op->element_size = 8; + break; + + case 140: /* stxsiwx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + break; + + case 268: /* lxvx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 269: /* lxvl */ + case 301: { /* lxvll */ + int nb; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->ea = ra ? regs->gpr[ra] : 0; + nb = regs->gpr[rb] & 0xff; + if (nb > 16) + nb = 16; + op->type = MKOP(LOAD_VSX, 0, nb); + op->element_size = 16; + op->vsx_flags = ((word & 0x20) ? VSX_LDLEFT : 0) | + VSX_CHECK_VEC; + break; + } + case 332: /* lxvdsx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_SPLAT; + break; + + case 333: /* lxvpx */ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + goto unknown_opcode; + op->reg = VSX_REGISTER_XTP(rd); + op->type = MKOP(LOAD_VSX, 0, 32); + op->element_size = 32; + break; + + case 364: /* lxvwsx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 4; + op->vsx_flags = VSX_SPLAT | VSX_CHECK_VEC; + break; + + case 396: /* stxvx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 397: /* stxvl */ + case 429: { /* stxvll */ + int nb; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->ea = ra ? regs->gpr[ra] : 0; + nb = regs->gpr[rb] & 0xff; + if (nb > 16) + nb = 16; + op->type = MKOP(STORE_VSX, 0, nb); + op->element_size = 16; + op->vsx_flags = ((word & 0x20) ? VSX_LDLEFT : 0) | + VSX_CHECK_VEC; + break; + } + case 461: /* stxvpx */ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + goto unknown_opcode; + op->reg = VSX_REGISTER_XTP(rd); + op->type = MKOP(STORE_VSX, 0, 32); + op->element_size = 32; + break; + case 524: /* lxsspx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV; + break; + + case 588: /* lxsdx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + break; + + case 652: /* stxsspx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV; + break; + + case 716: /* stxsdx */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 8); + op->element_size = 8; + break; + + case 780: /* lxvw4x */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 4; + break; + + case 781: /* lxsibzx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 1); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 812: /* lxvh8x */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 2; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 813: /* lxsihzx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 2); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 844: /* lxvd2x */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 8; + break; + + case 876: /* lxvb16x */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 1; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 908: /* stxvw4x */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 4; + break; + + case 909: /* stxsibx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 1); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 940: /* stxvh8x */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 2; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 941: /* stxsihx */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 2); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 972: /* stxvd2x */ + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 8; + break; + + case 1004: /* stxvb16x */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd | ((word & 1) << 5); + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 1; + op->vsx_flags = VSX_CHECK_VEC; + break; + +#endif /* CONFIG_VSX */ + } + break; + + case 32: /* lwz */ + case 33: /* lwzu */ + op->type = MKOP(LOAD, u, 4); + op->ea = dform_ea(word, regs); + break; + + case 34: /* lbz */ + case 35: /* lbzu */ + op->type = MKOP(LOAD, u, 1); + op->ea = dform_ea(word, regs); + break; + + case 36: /* stw */ + case 37: /* stwu */ + op->type = MKOP(STORE, u, 4); + op->ea = dform_ea(word, regs); + break; + + case 38: /* stb */ + case 39: /* stbu */ + op->type = MKOP(STORE, u, 1); + op->ea = dform_ea(word, regs); + break; + + case 40: /* lhz */ + case 41: /* lhzu */ + op->type = MKOP(LOAD, u, 2); + op->ea = dform_ea(word, regs); + break; + + case 42: /* lha */ + case 43: /* lhau */ + op->type = MKOP(LOAD, SIGNEXT | u, 2); + op->ea = dform_ea(word, regs); + break; + + case 44: /* sth */ + case 45: /* sthu */ + op->type = MKOP(STORE, u, 2); + op->ea = dform_ea(word, regs); + break; + + case 46: /* lmw */ + if (ra >= rd) + break; /* invalid form, ra in range to load */ + op->type = MKOP(LOAD_MULTI, 0, 4 * (32 - rd)); + op->ea = dform_ea(word, regs); + break; + + case 47: /* stmw */ + op->type = MKOP(STORE_MULTI, 0, 4 * (32 - rd)); + op->ea = dform_ea(word, regs); + break; + +#ifdef CONFIG_PPC_FPU + case 48: /* lfs */ + case 49: /* lfsu */ + op->type = MKOP(LOAD_FP, u | FPCONV, 4); + op->ea = dform_ea(word, regs); + break; + + case 50: /* lfd */ + case 51: /* lfdu */ + op->type = MKOP(LOAD_FP, u, 8); + op->ea = dform_ea(word, regs); + break; + + case 52: /* stfs */ + case 53: /* stfsu */ + op->type = MKOP(STORE_FP, u | FPCONV, 4); + op->ea = dform_ea(word, regs); + break; + + case 54: /* stfd */ + case 55: /* stfdu */ + op->type = MKOP(STORE_FP, u, 8); + op->ea = dform_ea(word, regs); + break; +#endif + +#ifdef __powerpc64__ + case 56: /* lq */ + if (!((rd & 1) || (rd == ra))) + op->type = MKOP(LOAD, 0, 16); + op->ea = dqform_ea(word, regs); + break; +#endif + +#ifdef CONFIG_VSX + case 57: /* lfdp, lxsd, lxssp */ + op->ea = dsform_ea(word, regs); + switch (word & 3) { + case 0: /* lfdp */ + if (rd & 1) + break; /* reg must be even */ + op->type = MKOP(LOAD_FP, 0, 16); + break; + case 2: /* lxsd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 3: /* lxssp */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + } + break; +#endif /* CONFIG_VSX */ + +#ifdef __powerpc64__ + case 58: /* ld[u], lwa */ + op->ea = dsform_ea(word, regs); + switch (word & 3) { + case 0: /* ld */ + op->type = MKOP(LOAD, 0, 8); + break; + case 1: /* ldu */ + op->type = MKOP(LOAD, UPDATE, 8); + break; + case 2: /* lwa */ + op->type = MKOP(LOAD, SIGNEXT, 4); + break; + } + break; +#endif + +#ifdef CONFIG_VSX + case 6: + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + goto unknown_opcode; + op->ea = dqform_ea(word, regs); + op->reg = VSX_REGISTER_XTP(rd); + op->element_size = 32; + switch (word & 0xf) { + case 0: /* lxvp */ + op->type = MKOP(LOAD_VSX, 0, 32); + break; + case 1: /* stxvp */ + op->type = MKOP(STORE_VSX, 0, 32); + break; + } + break; + + case 61: /* stfdp, lxv, stxsd, stxssp, stxv */ + switch (word & 7) { + case 0: /* stfdp with LSB of DS field = 0 */ + case 4: /* stfdp with LSB of DS field = 1 */ + op->ea = dsform_ea(word, regs); + op->type = MKOP(STORE_FP, 0, 16); + break; + + case 1: /* lxv */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->ea = dqform_ea(word, regs); + if (word & 8) + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 2: /* stxsd with LSB of DS field = 0 */ + case 6: /* stxsd with LSB of DS field = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->ea = dsform_ea(word, regs); + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + + case 3: /* stxssp with LSB of DS field = 0 */ + case 7: /* stxssp with LSB of DS field = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->ea = dsform_ea(word, regs); + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + + case 5: /* stxv */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + goto unknown_opcode; + op->ea = dqform_ea(word, regs); + if (word & 8) + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, 0, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + } + break; +#endif /* CONFIG_VSX */ + +#ifdef __powerpc64__ + case 62: /* std[u] */ + op->ea = dsform_ea(word, regs); + switch (word & 3) { + case 0: /* std */ + op->type = MKOP(STORE, 0, 8); + break; + case 1: /* stdu */ + op->type = MKOP(STORE, UPDATE, 8); + break; + case 2: /* stq */ + if (!(rd & 1)) + op->type = MKOP(STORE, 0, 16); + break; + } + break; + case 1: /* Prefixed instructions */ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + goto unknown_opcode; + + prefix_r = GET_PREFIX_R(word); + ra = GET_PREFIX_RA(suffix); + op->update_reg = ra; + rd = (suffix >> 21) & 0x1f; + op->reg = rd; + op->val = regs->gpr[rd]; + + suffixopcode = get_op(suffix); + prefixtype = (word >> 24) & 0x3; + switch (prefixtype) { + case 0: /* Type 00 Eight-Byte Load/Store */ + if (prefix_r && ra) + break; + op->ea = mlsd_8lsd_ea(word, suffix, regs); + switch (suffixopcode) { + case 41: /* plwa */ + op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 4); + break; +#ifdef CONFIG_VSX + case 42: /* plxsd */ + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, PREFIXED, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 43: /* plxssp */ + op->reg = rd + 32; + op->type = MKOP(LOAD_VSX, PREFIXED, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + case 46: /* pstxsd */ + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, PREFIXED, 8); + op->element_size = 8; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 47: /* pstxssp */ + op->reg = rd + 32; + op->type = MKOP(STORE_VSX, PREFIXED, 4); + op->element_size = 8; + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; + break; + case 51: /* plxv1 */ + op->reg += 32; + fallthrough; + case 50: /* plxv0 */ + op->type = MKOP(LOAD_VSX, PREFIXED, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; + case 55: /* pstxv1 */ + op->reg = rd + 32; + fallthrough; + case 54: /* pstxv0 */ + op->type = MKOP(STORE_VSX, PREFIXED, 16); + op->element_size = 16; + op->vsx_flags = VSX_CHECK_VEC; + break; +#endif /* CONFIG_VSX */ + case 56: /* plq */ + op->type = MKOP(LOAD, PREFIXED, 16); + break; + case 57: /* pld */ + op->type = MKOP(LOAD, PREFIXED, 8); + break; +#ifdef CONFIG_VSX + case 58: /* plxvp */ + op->reg = VSX_REGISTER_XTP(rd); + op->type = MKOP(LOAD_VSX, PREFIXED, 32); + op->element_size = 32; + break; +#endif /* CONFIG_VSX */ + case 60: /* pstq */ + op->type = MKOP(STORE, PREFIXED, 16); + break; + case 61: /* pstd */ + op->type = MKOP(STORE, PREFIXED, 8); + break; +#ifdef CONFIG_VSX + case 62: /* pstxvp */ + op->reg = VSX_REGISTER_XTP(rd); + op->type = MKOP(STORE_VSX, PREFIXED, 32); + op->element_size = 32; + break; +#endif /* CONFIG_VSX */ + } + break; + case 1: /* Type 01 Eight-Byte Register-to-Register */ + break; + case 2: /* Type 10 Modified Load/Store */ + if (prefix_r && ra) + break; + op->ea = mlsd_8lsd_ea(word, suffix, regs); + switch (suffixopcode) { + case 32: /* plwz */ + op->type = MKOP(LOAD, PREFIXED, 4); + break; + case 34: /* plbz */ + op->type = MKOP(LOAD, PREFIXED, 1); + break; + case 36: /* pstw */ + op->type = MKOP(STORE, PREFIXED, 4); + break; + case 38: /* pstb */ + op->type = MKOP(STORE, PREFIXED, 1); + break; + case 40: /* plhz */ + op->type = MKOP(LOAD, PREFIXED, 2); + break; + case 42: /* plha */ + op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 2); + break; + case 44: /* psth */ + op->type = MKOP(STORE, PREFIXED, 2); + break; + case 48: /* plfs */ + op->type = MKOP(LOAD_FP, PREFIXED | FPCONV, 4); + break; + case 50: /* plfd */ + op->type = MKOP(LOAD_FP, PREFIXED, 8); + break; + case 52: /* pstfs */ + op->type = MKOP(STORE_FP, PREFIXED | FPCONV, 4); + break; + case 54: /* pstfd */ + op->type = MKOP(STORE_FP, PREFIXED, 8); + break; + } + break; + case 3: /* Type 11 Modified Register-to-Register */ + break; + } +#endif /* __powerpc64__ */ + + } + + if (OP_IS_LOAD_STORE(op->type) && (op->type & UPDATE)) { + switch (GETTYPE(op->type)) { + case LOAD: + if (ra == rd) + goto unknown_opcode; + fallthrough; + case STORE: + case LOAD_FP: + case STORE_FP: + if (ra == 0) + goto unknown_opcode; + } + } + +#ifdef CONFIG_VSX + if ((GETTYPE(op->type) == LOAD_VSX || + GETTYPE(op->type) == STORE_VSX) && + !cpu_has_feature(CPU_FTR_VSX)) { + return -1; + } +#endif /* CONFIG_VSX */ + + return 0; + + unknown_opcode: + op->type = UNKNOWN; + return 0; + + logical_done: + if (word & 1) + set_cr0(regs, op); + logical_done_nocc: + op->reg = ra; + op->type |= SETREG; + return 1; + + arith_done: + if (word & 1) + set_cr0(regs, op); + compute_done: + op->reg = rd; + op->type |= SETREG; + return 1; + + priv: + op->type = INTERRUPT | 0x700; + op->val = SRR1_PROGPRIV; + return 0; + + trap: + op->type = INTERRUPT | 0x700; + op->val = SRR1_PROGTRAP; + return 0; +} +EXPORT_SYMBOL_GPL(analyse_instr); +NOKPROBE_SYMBOL(analyse_instr); + +/* + * For PPC32 we always use stwu with r1 to change the stack pointer. + * So this emulated store may corrupt the exception frame, now we + * have to provide the exception frame trampoline, which is pushed + * below the kprobed function stack. So we only update gpr[1] but + * don't emulate the real store operation. We will do real store + * operation safely in exception return code by checking this flag. + */ +static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs) +{ + /* + * Check if we already set since that means we'll + * lose the previous value. + */ + WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE)); + set_thread_flag(TIF_EMULATE_STACK_STORE); + return 0; +} + +static nokprobe_inline void do_signext(unsigned long *valp, int size) +{ + switch (size) { + case 2: + *valp = (signed short) *valp; + break; + case 4: + *valp = (signed int) *valp; + break; + } +} + +static nokprobe_inline void do_byterev(unsigned long *valp, int size) +{ + switch (size) { + case 2: + *valp = byterev_2(*valp); + break; + case 4: + *valp = byterev_4(*valp); + break; +#ifdef __powerpc64__ + case 8: + *valp = byterev_8(*valp); + break; +#endif + } +} + +/* + * Emulate an instruction that can be executed just by updating + * fields in *regs. + */ +void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) +{ + unsigned long next_pc; + + next_pc = truncate_if_32bit(regs->msr, regs->nip + GETLENGTH(op->type)); + switch (GETTYPE(op->type)) { + case COMPUTE: + if (op->type & SETREG) + regs->gpr[op->reg] = op->val; + if (op->type & SETCC) + regs->ccr = op->ccval; + if (op->type & SETXER) + regs->xer = op->xerval; + break; + + case BRANCH: + if (op->type & SETLK) + regs->link = next_pc; + if (op->type & BRTAKEN) + next_pc = op->val; + if (op->type & DECCTR) + --regs->ctr; + break; + + case BARRIER: + switch (op->type & BARRIER_MASK) { + case BARRIER_SYNC: + mb(); + break; + case BARRIER_ISYNC: + isync(); + break; + case BARRIER_EIEIO: + eieio(); + break; +#ifdef CONFIG_PPC64 + case BARRIER_LWSYNC: + asm volatile("lwsync" : : : "memory"); + break; + case BARRIER_PTESYNC: + asm volatile("ptesync" : : : "memory"); + break; +#endif + } + break; + + case MFSPR: + switch (op->spr) { + case SPRN_XER: + regs->gpr[op->reg] = regs->xer & 0xffffffffUL; + break; + case SPRN_LR: + regs->gpr[op->reg] = regs->link; + break; + case SPRN_CTR: + regs->gpr[op->reg] = regs->ctr; + break; + default: + WARN_ON_ONCE(1); + } + break; + + case MTSPR: + switch (op->spr) { + case SPRN_XER: + regs->xer = op->val & 0xffffffffUL; + break; + case SPRN_LR: + regs->link = op->val; + break; + case SPRN_CTR: + regs->ctr = op->val; + break; + default: + WARN_ON_ONCE(1); + } + break; + + default: + WARN_ON_ONCE(1); + } + regs_set_return_ip(regs, next_pc); +} +NOKPROBE_SYMBOL(emulate_update_regs); + +/* + * Emulate a previously-analysed load or store instruction. + * Return values are: + * 0 = instruction emulated successfully + * -EFAULT = address out of range or access faulted (regs->dar + * contains the faulting address) + * -EACCES = misaligned access, instruction requires alignment + * -EINVAL = unknown operation in *op + */ +int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op) +{ + int err, size, type; + int i, rd, nb; + unsigned int cr; + unsigned long val; + unsigned long ea; + bool cross_endian; + + err = 0; + size = GETSIZE(op->type); + type = GETTYPE(op->type); + cross_endian = (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + ea = truncate_if_32bit(regs->msr, op->ea); + + switch (type) { + case LARX: + if (ea & (size - 1)) + return -EACCES; /* can't handle misaligned */ + if (!address_ok(regs, ea, size)) + return -EFAULT; + err = 0; + val = 0; + switch (size) { +#ifdef CONFIG_PPC_HAS_LBARX_LHARX + case 1: + __get_user_asmx(val, ea, err, "lbarx"); + break; + case 2: + __get_user_asmx(val, ea, err, "lharx"); + break; +#endif + case 4: + __get_user_asmx(val, ea, err, "lwarx"); + break; +#ifdef __powerpc64__ + case 8: + __get_user_asmx(val, ea, err, "ldarx"); + break; + case 16: + err = do_lqarx(ea, ®s->gpr[op->reg]); + break; +#endif + default: + return -EINVAL; + } + if (err) { + regs->dar = ea; + break; + } + if (size < 16) + regs->gpr[op->reg] = val; + break; + + case STCX: + if (ea & (size - 1)) + return -EACCES; /* can't handle misaligned */ + if (!address_ok(regs, ea, size)) + return -EFAULT; + err = 0; + switch (size) { +#ifdef __powerpc64__ + case 1: + __put_user_asmx(op->val, ea, err, "stbcx.", cr); + break; + case 2: + __put_user_asmx(op->val, ea, err, "sthcx.", cr); + break; +#endif + case 4: + __put_user_asmx(op->val, ea, err, "stwcx.", cr); + break; +#ifdef __powerpc64__ + case 8: + __put_user_asmx(op->val, ea, err, "stdcx.", cr); + break; + case 16: + err = do_stqcx(ea, regs->gpr[op->reg], + regs->gpr[op->reg + 1], &cr); + break; +#endif + default: + return -EINVAL; + } + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + else + regs->dar = ea; + break; + + case LOAD: +#ifdef __powerpc64__ + if (size == 16) { + err = emulate_lq(regs, ea, op->reg, cross_endian); + break; + } +#endif + err = read_mem(®s->gpr[op->reg], ea, size, regs); + if (!err) { + if (op->type & SIGNEXT) + do_signext(®s->gpr[op->reg], size); + if ((op->type & BYTEREV) == (cross_endian ? 0 : BYTEREV)) + do_byterev(®s->gpr[op->reg], size); + } + break; + +#ifdef CONFIG_PPC_FPU + case LOAD_FP: + /* + * If the instruction is in userspace, we can emulate it even + * if the VMX state is not live, because we have the state + * stored in the thread_struct. If the instruction is in + * the kernel, we must not touch the state in the thread_struct. + */ + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_FP)) + return 0; + err = do_fp_load(op, ea, regs, cross_endian); + break; +#endif +#ifdef CONFIG_ALTIVEC + case LOAD_VMX: + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_VEC)) + return 0; + err = do_vec_load(op->reg, ea, size, regs, cross_endian); + break; +#endif +#ifdef CONFIG_VSX + case LOAD_VSX: { + unsigned long msrbit = MSR_VSX; + + /* + * Some VSX instructions check the MSR_VEC bit rather than MSR_VSX + * when the target of the instruction is a vector register. + */ + if (op->reg >= 32 && (op->vsx_flags & VSX_CHECK_VEC)) + msrbit = MSR_VEC; + if (!(regs->msr & MSR_PR) && !(regs->msr & msrbit)) + return 0; + err = do_vsx_load(op, ea, regs, cross_endian); + break; + } +#endif + case LOAD_MULTI: + if (!address_ok(regs, ea, size)) + return -EFAULT; + rd = op->reg; + for (i = 0; i < size; i += 4) { + unsigned int v32 = 0; + + nb = size - i; + if (nb > 4) + nb = 4; + err = copy_mem_in((u8 *) &v32, ea, nb, regs); + if (err) + break; + if (unlikely(cross_endian)) + v32 = byterev_4(v32); + regs->gpr[rd] = v32; + ea += 4; + /* reg number wraps from 31 to 0 for lsw[ix] */ + rd = (rd + 1) & 0x1f; + } + break; + + case STORE: +#ifdef __powerpc64__ + if (size == 16) { + err = emulate_stq(regs, ea, op->reg, cross_endian); + break; + } +#endif + if ((op->type & UPDATE) && size == sizeof(long) && + op->reg == 1 && op->update_reg == 1 && + !(regs->msr & MSR_PR) && + ea >= regs->gpr[1] - STACK_INT_FRAME_SIZE) { + err = handle_stack_update(ea, regs); + break; + } + if (unlikely(cross_endian)) + do_byterev(&op->val, size); + err = write_mem(op->val, ea, size, regs); + break; + +#ifdef CONFIG_PPC_FPU + case STORE_FP: + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_FP)) + return 0; + err = do_fp_store(op, ea, regs, cross_endian); + break; +#endif +#ifdef CONFIG_ALTIVEC + case STORE_VMX: + if (!(regs->msr & MSR_PR) && !(regs->msr & MSR_VEC)) + return 0; + err = do_vec_store(op->reg, ea, size, regs, cross_endian); + break; +#endif +#ifdef CONFIG_VSX + case STORE_VSX: { + unsigned long msrbit = MSR_VSX; + + /* + * Some VSX instructions check the MSR_VEC bit rather than MSR_VSX + * when the target of the instruction is a vector register. + */ + if (op->reg >= 32 && (op->vsx_flags & VSX_CHECK_VEC)) + msrbit = MSR_VEC; + if (!(regs->msr & MSR_PR) && !(regs->msr & msrbit)) + return 0; + err = do_vsx_store(op, ea, regs, cross_endian); + break; + } +#endif + case STORE_MULTI: + if (!address_ok(regs, ea, size)) + return -EFAULT; + rd = op->reg; + for (i = 0; i < size; i += 4) { + unsigned int v32 = regs->gpr[rd]; + + nb = size - i; + if (nb > 4) + nb = 4; + if (unlikely(cross_endian)) + v32 = byterev_4(v32); + err = copy_mem_out((u8 *) &v32, ea, nb, regs); + if (err) + break; + ea += 4; + /* reg number wraps from 31 to 0 for stsw[ix] */ + rd = (rd + 1) & 0x1f; + } + break; + + default: + return -EINVAL; + } + + if (err) + return err; + + if (op->type & UPDATE) + regs->gpr[op->update_reg] = op->ea; + + return 0; +} +NOKPROBE_SYMBOL(emulate_loadstore); + +/* + * Emulate instructions that cause a transfer of control, + * loads and stores, and a few other instructions. + * Returns 1 if the step was emulated, 0 if not, + * or -1 if the instruction is one that should not be stepped, + * such as an rfid, or a mtmsrd that would clear MSR_RI. + */ +int emulate_step(struct pt_regs *regs, ppc_inst_t instr) +{ + struct instruction_op op; + int r, err, type; + unsigned long val; + unsigned long ea; + + r = analyse_instr(&op, regs, instr); + if (r < 0) + return r; + if (r > 0) { + emulate_update_regs(regs, &op); + return 1; + } + + err = 0; + type = GETTYPE(op.type); + + if (OP_IS_LOAD_STORE(type)) { + err = emulate_loadstore(regs, &op); + if (err) + return 0; + goto instr_done; + } + + switch (type) { + case CACHEOP: + ea = truncate_if_32bit(regs->msr, op.ea); + if (!address_ok(regs, ea, 8)) + return 0; + switch (op.type & CACHEOP_MASK) { + case DCBST: + __cacheop_user_asmx(ea, err, "dcbst"); + break; + case DCBF: + __cacheop_user_asmx(ea, err, "dcbf"); + break; + case DCBTST: + if (op.reg == 0) + prefetchw((void *) ea); + break; + case DCBT: + if (op.reg == 0) + prefetch((void *) ea); + break; + case ICBI: + __cacheop_user_asmx(ea, err, "icbi"); + break; + case DCBZ: + err = emulate_dcbz(ea, regs); + break; + } + if (err) { + regs->dar = ea; + return 0; + } + goto instr_done; + + case MFMSR: + regs->gpr[op.reg] = regs->msr & MSR_MASK; + goto instr_done; + + case MTMSR: + val = regs->gpr[op.reg]; + if ((val & MSR_RI) == 0) + /* can't step mtmsr[d] that would clear MSR_RI */ + return -1; + /* here op.val is the mask of bits to change */ + regs_set_return_msr(regs, (regs->msr & ~op.val) | (val & op.val)); + goto instr_done; + + case SYSCALL: /* sc */ + /* + * Per ISA v3.1, section 7.5.15 'Trace Interrupt', we can't + * single step a system call instruction: + * + * Successful completion for an instruction means that the + * instruction caused no other interrupt. Thus a Trace + * interrupt never occurs for a System Call or System Call + * Vectored instruction, or for a Trap instruction that + * traps. + */ + return -1; + case SYSCALL_VECTORED_0: /* scv 0 */ + return -1; + case RFI: + return -1; + } + return 0; + + instr_done: + regs_set_return_ip(regs, + truncate_if_32bit(regs->msr, regs->nip + GETLENGTH(op.type))); + return 1; +} +NOKPROBE_SYMBOL(emulate_step); diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S new file mode 100644 index 0000000000..daa72061dc --- /dev/null +++ b/arch/powerpc/lib/string.S @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * String handling functions for PowerPC. + * + * Copyright (C) 1996 Paul Mackerras. + */ +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/cache.h> + + .text + +/* This clears out any unused part of the destination buffer, + just as the libc version does. -- paulus */ +_GLOBAL(strncpy) + PPC_LCMPI 0,r5,0 + beqlr + mtctr r5 + addi r6,r3,-1 + addi r4,r4,-1 + .balign IFETCH_ALIGN_BYTES +1: lbzu r0,1(r4) + cmpwi 0,r0,0 + stbu r0,1(r6) + bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ + bnelr /* if we didn't hit a null char, we're done */ + mfctr r5 + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */ + beqlr /* we know r0 == 0 here */ +2: stbu r0,1(r6) /* clear it out if so */ + bdnz 2b + blr +EXPORT_SYMBOL(strncpy) + +_GLOBAL(strncmp) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r5,r3,-1 + addi r4,r4,-1 + .balign IFETCH_ALIGN_BYTES +1: lbzu r3,1(r5) + cmpwi 1,r3,0 + lbzu r0,1(r4) + subf. r3,r0,r3 + beqlr 1 + bdnzt eq,1b + blr +2: li r3,0 + blr +EXPORT_SYMBOL(strncmp) + +_GLOBAL(memchr) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r3,r3,-1 + .balign IFETCH_ALIGN_BYTES +1: lbzu r0,1(r3) + cmpw 0,r0,r4 + bdnzf 2,1b + beqlr +2: li r3,0 + blr +EXPORT_SYMBOL(memchr) diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S new file mode 100644 index 0000000000..3ee45619a3 --- /dev/null +++ b/arch/powerpc/lib/string_32.S @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * String handling functions for PowerPC32 + * + * Copyright (C) 1996 Paul Mackerras. + * + */ + +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/cache.h> + + .text + +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + +_GLOBAL(__arch_clear_user) +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. + */ + cmplwi cr0, r4, 4 + mr r10, r3 + li r3, 0 + blt 7f + +11: stw r3, 0(r10) + beqlr + andi. r0, r10, 3 + add r11, r0, r4 + subf r6, r0, r10 + + clrlwi r7, r6, 32 - LG_CACHELINE_BYTES + add r8, r7, r11 + srwi r9, r8, LG_CACHELINE_BYTES + addic. r9, r9, -1 /* total number of complete cachelines */ + ble 2f + xori r0, r7, CACHELINE_MASK & ~3 + srwi. r0, r0, 2 + beq 3f + mtctr r0 +4: stwu r3, 4(r6) + bdnz 4b +3: mtctr r9 + li r7, 4 +10: dcbz r7, r6 + addi r6, r6, CACHELINE_BYTES + bdnz 10b + clrlwi r11, r8, 32 - LG_CACHELINE_BYTES + addi r11, r11, 4 + +2: srwi r0 ,r11 ,2 + mtctr r0 + bdz 6f +1: stwu r3, 4(r6) + bdnz 1b +6: andi. r11, r11, 3 + beqlr + mtctr r11 + addi r6, r6, 3 +8: stbu r3, 1(r6) + bdnz 8b + blr + +7: cmpwi cr0, r4, 0 + beqlr + mtctr r4 + addi r6, r10, -1 +9: stbu r3, 1(r6) + bdnz 9b + blr + +90: mr r3, r4 + blr +91: add r3, r10, r4 + subf r3, r6, r3 + blr + + EX_TABLE(11b, 90b) + EX_TABLE(4b, 91b) + EX_TABLE(10b, 91b) + EX_TABLE(1b, 91b) + EX_TABLE(8b, 91b) + EX_TABLE(9b, 91b) + +EXPORT_SYMBOL(__arch_clear_user) diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 0000000000..a25eb85884 --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/linkage.h> +#include <asm/asm-offsets.h> + +/** + * __arch_clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + + .macro err1 +100: + EX_TABLE(100b,.Ldo_err1) + .endm + + .macro err2 +200: + EX_TABLE(200b,.Ldo_err2) + .endm + + .macro err3 +300: + EX_TABLE(300b,.Ldo_err3) + .endm + +.Ldo_err1: + mr r3,r8 + +.Ldo_err2: + mtctr r4 +1: +err3; stb r0,0(r3) + addi r3,r3,1 + addi r4,r4,-1 + bdnz 1b + +.Ldo_err3: + mr r3,r4 + blr + +_GLOBAL_TOC(__arch_clear_user) + cmpdi r4,32 + neg r6,r3 + li r0,0 + blt .Lshort_clear + mr r8,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + /* Get the destination 8 byte aligned */ + bf cr7*4+3,1f +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r4,r4,r6 + + cmpdi r4,32 + cmpdi cr1,r4,512 + blt .Lshort_clear + bgt cr1,.Llong_clear + +.Lmedium_clear: + srdi r6,r4,5 + mtctr r6 + + /* Do 32 byte chunks */ +4: +err2; std r0,0(r3) +err2; std r0,8(r3) +err2; std r0,16(r3) +err2; std r0,24(r3) + addi r3,r3,32 + addi r4,r4,-32 + bdnz 4b + +.Lshort_clear: + /* up to 31 bytes to go */ + cmpdi r4,16 + blt 6f +err2; std r0,0(r3) +err2; std r0,8(r3) + addi r3,r3,16 + addi r4,r4,-16 + + /* Up to 15 bytes to go */ +6: mr r8,r3 + clrldi r4,r4,(64-4) + mtocrf 0x01,r4 + bf cr7*4+0,7f +err1; std r0,0(r3) + addi r3,r3,8 + +7: bf cr7*4+1,8f +err1; stw r0,0(r3) + addi r3,r3,4 + +8: bf cr7*4+2,9f +err1; sth r0,0(r3) + addi r3,r3,2 + +9: bf cr7*4+3,10f +err1; stb r0,0(r3) + +10: li r3,0 + blr + +.Llong_clear: + LOAD_REG_ADDR(r5, ppc64_caches) + + bf cr7*4+0,11f +err2; std r0,0(r3) + addi r3,r3,8 + addi r4,r4,-8 + + /* Destination is 16 byte aligned, need to get it cache block aligned */ +11: lwz r7,DCACHEL1LOGBLOCKSIZE(r5) + lwz r9,DCACHEL1BLOCKSIZE(r5) + + /* + * With worst case alignment the long clear loop takes a minimum + * of 1 byte less than 2 cachelines. + */ + sldi r10,r9,2 + cmpd r4,r10 + blt .Lmedium_clear + + neg r6,r3 + addi r10,r9,-1 + and. r5,r6,r10 + beq 13f + + srdi r6,r5,4 + mtctr r6 + mr r8,r3 +12: +err1; std r0,0(r3) +err1; std r0,8(r3) + addi r3,r3,16 + bdnz 12b + + sub r4,r4,r5 + +13: srd r6,r4,r7 + mtctr r6 + mr r8,r3 +14: +err1; dcbz 0,r3 + add r3,r3,r9 + bdnz 14b + + and r4,r4,r10 + + cmpdi r4,32 + blt .Lshort_clear + b .Lmedium_clear +EXPORT_SYMBOL(__arch_clear_user) diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S new file mode 100644 index 0000000000..bbd24feb23 --- /dev/null +++ b/arch/powerpc/lib/strlen_32.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * strlen() for PPC32 + * + * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information. + * + * Inspired from glibc implementation + */ +#include <linux/export.h> +#include <asm/ppc_asm.h> +#include <asm/cache.h> + + .text + +/* + * Algorithm: + * + * 1) Given a word 'x', we can test to see if it contains any 0 bytes + * by subtracting 0x01010101, and seeing if any of the high bits of each + * byte changed from 0 to 1. This works because the least significant + * 0 byte must have had no incoming carry (otherwise it's not the least + * significant), so it is 0x00 - 0x01 == 0xff. For all other + * byte values, either they have the high bit set initially, or when + * 1 is subtracted you get a value in the range 0x00-0x7f, none of which + * have their high bit set. The expression here is + * (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when + * there were no 0x00 bytes in the word. You get 0x80 in bytes that + * match, but possibly false 0x80 matches in the next more significant + * byte to a true match due to carries. For little-endian this is + * of no consequence since the least significant match is the one + * we're interested in, but big-endian needs method 2 to find which + * byte matches. + * 2) Given a word 'x', we can test to see _which_ byte was zero by + * calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080). + * This produces 0x80 in each byte that was zero, and 0x00 in all + * the other bytes. The '| ~0x80808080' clears the low 7 bits in each + * byte, and the '| x' part ensures that bytes with the high bit set + * produce 0x00. The addition will carry into the high bit of each byte + * iff that byte had one of its low 7 bits set. We can then just see + * which was the most significant bit set and divide by 8 to find how + * many to add to the index. + * This is from the book 'The PowerPC Compiler Writer's Guide', + * by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren. + */ + +_GLOBAL(strlen) + andi. r0, r3, 3 + lis r7, 0x0101 + addi r10, r3, -4 + addic r7, r7, 0x0101 /* r7 = 0x01010101 (lomagic) & clear XER[CA] */ + rotlwi r6, r7, 31 /* r6 = 0x80808080 (himagic) */ + bne- 3f + .balign IFETCH_ALIGN_BYTES +1: lwzu r9, 4(r10) +2: subf r8, r7, r9 + and. r8, r8, r6 + beq+ 1b + andc. r8, r8, r9 + beq+ 1b + andc r8, r9, r6 + orc r9, r9, r6 + subfe r8, r6, r8 + nor r8, r8, r9 + cntlzw r8, r8 + subf r3, r3, r10 + srwi r8, r8, 3 + add r3, r3, r8 + blr + + /* Missaligned string: make sure bytes before string are seen not 0 */ +3: xor r10, r10, r0 + orc r8, r8, r8 + lwzu r9, 4(r10) + slwi r0, r0, 3 + srw r8, r8, r0 + orc r9, r9, r8 + b 2b +EXPORT_SYMBOL(strlen) diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c new file mode 100644 index 0000000000..c44823292f --- /dev/null +++ b/arch/powerpc/lib/test-code-patching.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + */ + +#include <linux/vmalloc.h> +#include <linux/init.h> + +#include <asm/code-patching.h> + +static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) +{ + if (instr_is_branch_iform(ppc_inst_read(instr)) || + instr_is_branch_bform(ppc_inst_read(instr))) + return branch_target(instr) == addr; + + return 0; +} + +static void __init test_trampoline(void) +{ + asm ("nop;nop;\n"); +} + +#define check(x) do { \ + if (!(x)) \ + pr_err("code-patching: test failed at line %d\n", __LINE__); \ +} while (0) + +static void __init test_branch_iform(void) +{ + int err; + ppc_inst_t instr; + u32 tmp[2]; + u32 *iptr = tmp; + unsigned long addr = (unsigned long)tmp; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_iform(ppc_inst(0x48000000))); + /* All bits of target set, and flags */ + check(instr_is_branch_iform(ppc_inst(0x4bffffff))); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_iform(ppc_inst(0xcbffffff))); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_iform(ppc_inst(0x7bffffff))); + + /* Simplest case, branch to self with link */ + check(instr_is_branch_iform(ppc_inst(0x48000001))); + /* All bits of targets set */ + check(instr_is_branch_iform(ppc_inst(0x4bfffffd))); + /* Some bits of targets set */ + check(instr_is_branch_iform(ppc_inst(0x4bff00fd))); + /* Must be a valid branch to start with */ + check(!instr_is_branch_iform(ppc_inst(0x7bfffffd))); + + /* Absolute branch to 0x100 */ + ppc_inst_write(iptr, ppc_inst(0x48000103)); + check(instr_is_branch_to_addr(iptr, 0x100)); + /* Absolute branch to 0x420fc */ + ppc_inst_write(iptr, ppc_inst(0x480420ff)); + check(instr_is_branch_to_addr(iptr, 0x420fc)); + /* Maximum positive relative branch, + 20MB - 4B */ + ppc_inst_write(iptr, ppc_inst(0x49fffffc)); + check(instr_is_branch_to_addr(iptr, addr + 0x1FFFFFC)); + /* Smallest negative relative branch, - 4B */ + ppc_inst_write(iptr, ppc_inst(0x4bfffffc)); + check(instr_is_branch_to_addr(iptr, addr - 4)); + /* Largest negative relative branch, - 32 MB */ + ppc_inst_write(iptr, ppc_inst(0x4a000000)); + check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); + + /* Branch to self, with link */ + err = create_branch(&instr, iptr, addr, BRANCH_SET_LINK); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + + /* Branch to self - 0x100, with link */ + err = create_branch(&instr, iptr, addr - 0x100, BRANCH_SET_LINK); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x100)); + + /* Branch to self + 0x100, no link */ + err = create_branch(&instr, iptr, addr + 0x100, 0); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 MB */ + err = create_branch(&instr, iptr, addr - 0x2000000, BRANCH_SET_LINK); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); + + /* Out of range relative negative offset, - 32 MB + 4*/ + err = create_branch(&instr, iptr, addr - 0x2000004, BRANCH_SET_LINK); + check(err); + + /* Out of range relative positive offset, + 32 MB */ + err = create_branch(&instr, iptr, addr + 0x2000000, BRANCH_SET_LINK); + check(err); + + /* Unaligned target */ + err = create_branch(&instr, iptr, addr + 3, BRANCH_SET_LINK); + check(err); + + /* Check flags are masked correctly */ + err = create_branch(&instr, iptr, addr, 0xFFFFFFFC); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + check(ppc_inst_equal(instr, ppc_inst(0x48000000))); +} + +static void __init test_create_function_call(void) +{ + u32 *iptr; + unsigned long dest; + ppc_inst_t instr; + + /* Check we can create a function call */ + iptr = (u32 *)ppc_function_entry(test_trampoline); + dest = ppc_function_entry(test_create_function_call); + create_branch(&instr, iptr, dest, BRANCH_SET_LINK); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, dest)); +} + +static void __init test_branch_bform(void) +{ + int err; + unsigned long addr; + ppc_inst_t instr; + u32 tmp[2]; + u32 *iptr = tmp; + unsigned int flags; + + addr = (unsigned long)iptr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_bform(ppc_inst(0x40000000))); + /* All bits of target set, and flags */ + check(instr_is_branch_bform(ppc_inst(0x43ffffff))); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_bform(ppc_inst(0xc3ffffff))); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_bform(ppc_inst(0x7bffffff))); + + /* Absolute conditional branch to 0x100 */ + ppc_inst_write(iptr, ppc_inst(0x43ff0103)); + check(instr_is_branch_to_addr(iptr, 0x100)); + /* Absolute conditional branch to 0x20fc */ + ppc_inst_write(iptr, ppc_inst(0x43ff20ff)); + check(instr_is_branch_to_addr(iptr, 0x20fc)); + /* Maximum positive relative conditional branch, + 32 KB - 4B */ + ppc_inst_write(iptr, ppc_inst(0x43ff7ffc)); + check(instr_is_branch_to_addr(iptr, addr + 0x7FFC)); + /* Smallest negative relative conditional branch, - 4B */ + ppc_inst_write(iptr, ppc_inst(0x43fffffc)); + check(instr_is_branch_to_addr(iptr, addr - 4)); + /* Largest negative relative conditional branch, - 32 KB */ + ppc_inst_write(iptr, ppc_inst(0x43ff8000)); + check(instr_is_branch_to_addr(iptr, addr - 0x8000)); + + /* All condition code bits set & link */ + flags = 0x3ff000 | BRANCH_SET_LINK; + + /* Branch to self */ + err = create_cond_branch(&instr, iptr, addr, flags); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + + /* Branch to self - 0x100 */ + err = create_cond_branch(&instr, iptr, addr - 0x100, flags); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x100)); + + /* Branch to self + 0x100 */ + err = create_cond_branch(&instr, iptr, addr + 0x100, flags); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 KB */ + err = create_cond_branch(&instr, iptr, addr - 0x8000, flags); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x8000)); + + /* Out of range relative negative offset, - 32 KB + 4*/ + err = create_cond_branch(&instr, iptr, addr - 0x8004, flags); + check(err); + + /* Out of range relative positive offset, + 32 KB */ + err = create_cond_branch(&instr, iptr, addr + 0x8000, flags); + check(err); + + /* Unaligned target */ + err = create_cond_branch(&instr, iptr, addr + 3, flags); + check(err); + + /* Check flags are masked correctly */ + err = create_cond_branch(&instr, iptr, addr, 0xFFFFFFFC); + ppc_inst_write(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + check(ppc_inst_equal(instr, ppc_inst(0x43FF0000))); +} + +static void __init test_translate_branch(void) +{ + unsigned long addr; + void *p, *q; + ppc_inst_t instr; + void *buf; + + buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); + check(buf); + if (!buf) + return; + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + check(instr_is_branch_to_addr(p, addr)); + q = p + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 MB */ + p = buf; + addr = (unsigned long)p; + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + q = buf + 0x2000000; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x4a000000))); + + /* Maximum positive case, move x to x - 32 MB + 4 */ + p = buf + 0x2000000; + addr = (unsigned long)p; + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x49fffffc))); + + /* Jump to x + 16 MB moved to x + 20 MB */ + p = buf; + addr = 0x1000000 + (unsigned long)buf; + create_branch(&instr, p, addr, BRANCH_SET_LINK); + ppc_inst_write(p, instr); + q = buf + 0x1400000; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 16 MB moved to x - 16 MB + 4 */ + p = buf + 0x1000000; + addr = 0x2000000 + (unsigned long)buf; + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + + /* Conditional branch tests */ + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + check(instr_is_branch_to_addr(p, addr)); + q = buf + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 KB */ + p = buf; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0xFFFFFFFC); + ppc_inst_write(p, instr); + q = buf + 0x8000; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff8000))); + + /* Maximum positive case, move x to x - 32 KB + 4 */ + p = buf + 0x8000; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0xFFFFFFFC); + ppc_inst_write(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff7ffc))); + + /* Jump to x + 12 KB moved to x + 20 KB */ + p = buf; + addr = 0x3000 + (unsigned long)buf; + create_cond_branch(&instr, p, addr, BRANCH_SET_LINK); + ppc_inst_write(p, instr); + q = buf + 0x5000; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 8 KB moved to x - 8 KB + 4 */ + p = buf + 0x2000; + addr = 0x4000 + (unsigned long)buf; + create_cond_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + ppc_inst_write(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Free the buffer we were using */ + vfree(buf); +} + +static void __init test_prefixed_patching(void) +{ + u32 *iptr = (u32 *)ppc_function_entry(test_trampoline); + u32 expected[2] = {OP_PREFIX << 26, 0}; + ppc_inst_t inst = ppc_inst_prefix(OP_PREFIX << 26, 0); + + if (!IS_ENABLED(CONFIG_PPC64)) + return; + + patch_instruction(iptr, inst); + + check(!memcmp(iptr, expected, sizeof(expected))); +} + +static int __init test_code_patching(void) +{ + pr_info("Running code patching self-tests ...\n"); + + test_branch_iform(); + test_branch_bform(); + test_create_function_call(); + test_translate_branch(); + test_prefixed_patching(); + + return 0; +} +late_initcall(test_code_patching); diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c new file mode 100644 index 0000000000..23c7805fb7 --- /dev/null +++ b/arch/powerpc/lib/test_emulate_step.c @@ -0,0 +1,1741 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Simple sanity tests for instruction emulation infrastructure. + * + * Copyright IBM Corp. 2016 + */ + +#define pr_fmt(fmt) "emulate_step_test: " fmt + +#include <linux/ptrace.h> +#include <asm/cpu_has_feature.h> +#include <asm/sstep.h> +#include <asm/ppc-opcode.h> +#include <asm/code-patching.h> +#include <asm/inst.h> + +#define MAX_SUBTESTS 16 + +#define IGNORE_GPR(n) (0x1UL << (n)) +#define IGNORE_XER (0x1UL << 32) +#define IGNORE_CCR (0x1UL << 33) +#define NEGATIVE_TEST (0x1UL << 63) + +#define TEST_PLD(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_8LS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_PLD | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PLWZ(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_RAW_LWZ(r, base, i)) + +#define TEST_PSTD(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_8LS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_PSTD | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PLFS(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_LFS | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PSTFS(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_STFS | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PLFD(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_LFD | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PSTFD(r, base, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_INST_STFD | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + +#define TEST_PADDI(t, a, i, pr) \ + ppc_inst_prefix(PPC_PREFIX_MLS | __PPC_PRFX_R(pr) | IMM_H(i), \ + PPC_RAW_ADDI(t, a, i)) + +static void __init init_pt_regs(struct pt_regs *regs) +{ + static unsigned long msr; + static bool msr_cached; + + memset(regs, 0, sizeof(struct pt_regs)); + + if (likely(msr_cached)) { + regs->msr = msr; + return; + } + + asm volatile("mfmsr %0" : "=r"(regs->msr)); + + regs->msr |= MSR_FP; + regs->msr |= MSR_VEC; + regs->msr |= MSR_VSX; + + msr = regs->msr; + msr_cached = true; +} + +static void __init show_result(char *mnemonic, char *result) +{ + pr_info("%-14s : %s\n", mnemonic, result); +} + +static void __init show_result_with_descr(char *mnemonic, char *descr, + char *result) +{ + pr_info("%-14s : %-50s %s\n", mnemonic, descr, result); +} + +static void __init test_ld(void) +{ + struct pt_regs regs; + unsigned long a = 0x23; + int stepped = -1; + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long) &a; + + /* ld r5, 0(r3) */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LD(5, 3, 0))); + + if (stepped == 1 && regs.gpr[5] == a) + show_result("ld", "PASS"); + else + show_result("ld", "FAIL"); +} + +static void __init test_pld(void) +{ + struct pt_regs regs; + unsigned long a = 0x23; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("pld", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long)&a; + + /* pld r5, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PLD(5, 3, 0, 0)); + + if (stepped == 1 && regs.gpr[5] == a) + show_result("pld", "PASS"); + else + show_result("pld", "FAIL"); +} + +static void __init test_lwz(void) +{ + struct pt_regs regs; + unsigned int a = 0x4545; + int stepped = -1; + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long) &a; + + /* lwz r5, 0(r3) */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LWZ(5, 3, 0))); + + if (stepped == 1 && regs.gpr[5] == a) + show_result("lwz", "PASS"); + else + show_result("lwz", "FAIL"); +} + +static void __init test_plwz(void) +{ + struct pt_regs regs; + unsigned int a = 0x4545; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("plwz", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long)&a; + + /* plwz r5, 0(r3), 0 */ + + stepped = emulate_step(®s, TEST_PLWZ(5, 3, 0, 0)); + + if (stepped == 1 && regs.gpr[5] == a) + show_result("plwz", "PASS"); + else + show_result("plwz", "FAIL"); +} + +static void __init test_lwzx(void) +{ + struct pt_regs regs; + unsigned int a[3] = {0x0, 0x0, 0x1234}; + int stepped = -1; + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long) a; + regs.gpr[4] = 8; + regs.gpr[5] = 0x8765; + + /* lwzx r5, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LWZX(5, 3, 4))); + if (stepped == 1 && regs.gpr[5] == a[2]) + show_result("lwzx", "PASS"); + else + show_result("lwzx", "FAIL"); +} + +static void __init test_std(void) +{ + struct pt_regs regs; + unsigned long a = 0x1234; + int stepped = -1; + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long) &a; + regs.gpr[5] = 0x5678; + + /* std r5, 0(r3) */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STD(5, 3, 0))); + if (stepped == 1 && regs.gpr[5] == a) + show_result("std", "PASS"); + else + show_result("std", "FAIL"); +} + +static void __init test_pstd(void) +{ + struct pt_regs regs; + unsigned long a = 0x1234; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("pstd", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long)&a; + regs.gpr[5] = 0x5678; + + /* pstd r5, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PSTD(5, 3, 0, 0)); + if (stepped == 1 || regs.gpr[5] == a) + show_result("pstd", "PASS"); + else + show_result("pstd", "FAIL"); +} + +static void __init test_ldarx_stdcx(void) +{ + struct pt_regs regs; + unsigned long a = 0x1234; + int stepped = -1; + unsigned long cr0_eq = 0x1 << 29; /* eq bit of CR0 */ + + init_pt_regs(®s); + asm volatile("mfcr %0" : "=r"(regs.ccr)); + + + /*** ldarx ***/ + + regs.gpr[3] = (unsigned long) &a; + regs.gpr[4] = 0; + regs.gpr[5] = 0x5678; + + /* ldarx r5, r3, r4, 0 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LDARX(5, 3, 4, 0))); + + /* + * Don't touch 'a' here. Touching 'a' can do Load/store + * of 'a' which result in failure of subsequent stdcx. + * Instead, use hardcoded value for comparison. + */ + if (stepped <= 0 || regs.gpr[5] != 0x1234) { + show_result("ldarx / stdcx.", "FAIL (ldarx)"); + return; + } + + + /*** stdcx. ***/ + + regs.gpr[5] = 0x9ABC; + + /* stdcx. r5, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STDCX(5, 3, 4))); + + /* + * Two possible scenarios that indicates successful emulation + * of stdcx. : + * 1. Reservation is active and store is performed. In this + * case cr0.eq bit will be set to 1. + * 2. Reservation is not active and store is not performed. + * In this case cr0.eq bit will be set to 0. + */ + if (stepped == 1 && ((regs.gpr[5] == a && (regs.ccr & cr0_eq)) + || (regs.gpr[5] != a && !(regs.ccr & cr0_eq)))) + show_result("ldarx / stdcx.", "PASS"); + else + show_result("ldarx / stdcx.", "FAIL (stdcx.)"); +} + +#ifdef CONFIG_PPC_FPU +static void __init test_lfsx_stfsx(void) +{ + struct pt_regs regs; + union { + float a; + int b; + } c; + int cached_b; + int stepped = -1; + + init_pt_regs(®s); + + + /*** lfsx ***/ + + c.a = 123.45; + cached_b = c.b; + + regs.gpr[3] = (unsigned long) &c.a; + regs.gpr[4] = 0; + + /* lfsx frt10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LFSX(10, 3, 4))); + + if (stepped == 1) + show_result("lfsx", "PASS"); + else + show_result("lfsx", "FAIL"); + + + /*** stfsx ***/ + + c.a = 678.91; + + /* stfsx frs10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STFSX(10, 3, 4))); + + if (stepped == 1 && c.b == cached_b) + show_result("stfsx", "PASS"); + else + show_result("stfsx", "FAIL"); +} + +static void __init test_plfs_pstfs(void) +{ + struct pt_regs regs; + union { + float a; + int b; + } c; + int cached_b; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("pld", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + + + /*** plfs ***/ + + c.a = 123.45; + cached_b = c.b; + + regs.gpr[3] = (unsigned long)&c.a; + + /* plfs frt10, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PLFS(10, 3, 0, 0)); + + if (stepped == 1) + show_result("plfs", "PASS"); + else + show_result("plfs", "FAIL"); + + + /*** pstfs ***/ + + c.a = 678.91; + + /* pstfs frs10, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PSTFS(10, 3, 0, 0)); + + if (stepped == 1 && c.b == cached_b) + show_result("pstfs", "PASS"); + else + show_result("pstfs", "FAIL"); +} + +static void __init test_lfdx_stfdx(void) +{ + struct pt_regs regs; + union { + double a; + long b; + } c; + long cached_b; + int stepped = -1; + + init_pt_regs(®s); + + + /*** lfdx ***/ + + c.a = 123456.78; + cached_b = c.b; + + regs.gpr[3] = (unsigned long) &c.a; + regs.gpr[4] = 0; + + /* lfdx frt10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LFDX(10, 3, 4))); + + if (stepped == 1) + show_result("lfdx", "PASS"); + else + show_result("lfdx", "FAIL"); + + + /*** stfdx ***/ + + c.a = 987654.32; + + /* stfdx frs10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STFDX(10, 3, 4))); + + if (stepped == 1 && c.b == cached_b) + show_result("stfdx", "PASS"); + else + show_result("stfdx", "FAIL"); +} + +static void __init test_plfd_pstfd(void) +{ + struct pt_regs regs; + union { + double a; + long b; + } c; + long cached_b; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("pld", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + + + /*** plfd ***/ + + c.a = 123456.78; + cached_b = c.b; + + regs.gpr[3] = (unsigned long)&c.a; + + /* plfd frt10, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PLFD(10, 3, 0, 0)); + + if (stepped == 1) + show_result("plfd", "PASS"); + else + show_result("plfd", "FAIL"); + + + /*** pstfd ***/ + + c.a = 987654.32; + + /* pstfd frs10, 0(r3), 0 */ + stepped = emulate_step(®s, TEST_PSTFD(10, 3, 0, 0)); + + if (stepped == 1 && c.b == cached_b) + show_result("pstfd", "PASS"); + else + show_result("pstfd", "FAIL"); +} +#else +static void __init test_lfsx_stfsx(void) +{ + show_result("lfsx", "SKIP (CONFIG_PPC_FPU is not set)"); + show_result("stfsx", "SKIP (CONFIG_PPC_FPU is not set)"); +} + +static void __init test_plfs_pstfs(void) +{ + show_result("plfs", "SKIP (CONFIG_PPC_FPU is not set)"); + show_result("pstfs", "SKIP (CONFIG_PPC_FPU is not set)"); +} + +static void __init test_lfdx_stfdx(void) +{ + show_result("lfdx", "SKIP (CONFIG_PPC_FPU is not set)"); + show_result("stfdx", "SKIP (CONFIG_PPC_FPU is not set)"); +} + +static void __init test_plfd_pstfd(void) +{ + show_result("plfd", "SKIP (CONFIG_PPC_FPU is not set)"); + show_result("pstfd", "SKIP (CONFIG_PPC_FPU is not set)"); +} +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_ALTIVEC +static void __init test_lvx_stvx(void) +{ + struct pt_regs regs; + union { + vector128 a; + u32 b[4]; + } c; + u32 cached_b[4]; + int stepped = -1; + + init_pt_regs(®s); + + + /*** lvx ***/ + + cached_b[0] = c.b[0] = 923745; + cached_b[1] = c.b[1] = 2139478; + cached_b[2] = c.b[2] = 9012; + cached_b[3] = c.b[3] = 982134; + + regs.gpr[3] = (unsigned long) &c.a; + regs.gpr[4] = 0; + + /* lvx vrt10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LVX(10, 3, 4))); + + if (stepped == 1) + show_result("lvx", "PASS"); + else + show_result("lvx", "FAIL"); + + + /*** stvx ***/ + + c.b[0] = 4987513; + c.b[1] = 84313948; + c.b[2] = 71; + c.b[3] = 498532; + + /* stvx vrs10, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STVX(10, 3, 4))); + + if (stepped == 1 && cached_b[0] == c.b[0] && cached_b[1] == c.b[1] && + cached_b[2] == c.b[2] && cached_b[3] == c.b[3]) + show_result("stvx", "PASS"); + else + show_result("stvx", "FAIL"); +} +#else +static void __init test_lvx_stvx(void) +{ + show_result("lvx", "SKIP (CONFIG_ALTIVEC is not set)"); + show_result("stvx", "SKIP (CONFIG_ALTIVEC is not set)"); +} +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +static void __init test_lxvd2x_stxvd2x(void) +{ + struct pt_regs regs; + union { + vector128 a; + u32 b[4]; + } c; + u32 cached_b[4]; + int stepped = -1; + + init_pt_regs(®s); + + + /*** lxvd2x ***/ + + cached_b[0] = c.b[0] = 18233; + cached_b[1] = c.b[1] = 34863571; + cached_b[2] = c.b[2] = 834; + cached_b[3] = c.b[3] = 6138911; + + regs.gpr[3] = (unsigned long) &c.a; + regs.gpr[4] = 0; + + /* lxvd2x vsr39, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LXVD2X(39, R3, R4))); + + if (stepped == 1 && cpu_has_feature(CPU_FTR_VSX)) { + show_result("lxvd2x", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("lxvd2x", "PASS (!CPU_FTR_VSX)"); + else + show_result("lxvd2x", "FAIL"); + } + + + /*** stxvd2x ***/ + + c.b[0] = 21379463; + c.b[1] = 87; + c.b[2] = 374234; + c.b[3] = 4; + + /* stxvd2x vsr39, r3, r4 */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STXVD2X(39, R3, R4))); + + if (stepped == 1 && cached_b[0] == c.b[0] && cached_b[1] == c.b[1] && + cached_b[2] == c.b[2] && cached_b[3] == c.b[3] && + cpu_has_feature(CPU_FTR_VSX)) { + show_result("stxvd2x", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("stxvd2x", "PASS (!CPU_FTR_VSX)"); + else + show_result("stxvd2x", "FAIL"); + } +} +#else +static void __init test_lxvd2x_stxvd2x(void) +{ + show_result("lxvd2x", "SKIP (CONFIG_VSX is not set)"); + show_result("stxvd2x", "SKIP (CONFIG_VSX is not set)"); +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_VSX +static void __init test_lxvp_stxvp(void) +{ + struct pt_regs regs; + union { + vector128 a; + u32 b[4]; + } c[2]; + u32 cached_b[8]; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("lxvp", "SKIP (!CPU_FTR_ARCH_31)"); + show_result("stxvp", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + + /*** lxvp ***/ + + cached_b[0] = c[0].b[0] = 18233; + cached_b[1] = c[0].b[1] = 34863571; + cached_b[2] = c[0].b[2] = 834; + cached_b[3] = c[0].b[3] = 6138911; + cached_b[4] = c[1].b[0] = 1234; + cached_b[5] = c[1].b[1] = 5678; + cached_b[6] = c[1].b[2] = 91011; + cached_b[7] = c[1].b[3] = 121314; + + regs.gpr[4] = (unsigned long)&c[0].a; + + /* + * lxvp XTp,DQ(RA) + * XTp = 32xTX + 2xTp + * let TX=1 Tp=1 RA=4 DQ=0 + */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LXVP(34, 4, 0))); + + if (stepped == 1 && cpu_has_feature(CPU_FTR_VSX)) { + show_result("lxvp", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("lxvp", "PASS (!CPU_FTR_VSX)"); + else + show_result("lxvp", "FAIL"); + } + + /*** stxvp ***/ + + c[0].b[0] = 21379463; + c[0].b[1] = 87; + c[0].b[2] = 374234; + c[0].b[3] = 4; + c[1].b[0] = 90; + c[1].b[1] = 122; + c[1].b[2] = 555; + c[1].b[3] = 32144; + + /* + * stxvp XSp,DQ(RA) + * XSp = 32xSX + 2xSp + * let SX=1 Sp=1 RA=4 DQ=0 + */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STXVP(34, 4, 0))); + + if (stepped == 1 && cached_b[0] == c[0].b[0] && cached_b[1] == c[0].b[1] && + cached_b[2] == c[0].b[2] && cached_b[3] == c[0].b[3] && + cached_b[4] == c[1].b[0] && cached_b[5] == c[1].b[1] && + cached_b[6] == c[1].b[2] && cached_b[7] == c[1].b[3] && + cpu_has_feature(CPU_FTR_VSX)) { + show_result("stxvp", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("stxvp", "PASS (!CPU_FTR_VSX)"); + else + show_result("stxvp", "FAIL"); + } +} +#else +static void __init test_lxvp_stxvp(void) +{ + show_result("lxvp", "SKIP (CONFIG_VSX is not set)"); + show_result("stxvp", "SKIP (CONFIG_VSX is not set)"); +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_VSX +static void __init test_lxvpx_stxvpx(void) +{ + struct pt_regs regs; + union { + vector128 a; + u32 b[4]; + } c[2]; + u32 cached_b[8]; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("lxvpx", "SKIP (!CPU_FTR_ARCH_31)"); + show_result("stxvpx", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + init_pt_regs(®s); + + /*** lxvpx ***/ + + cached_b[0] = c[0].b[0] = 18233; + cached_b[1] = c[0].b[1] = 34863571; + cached_b[2] = c[0].b[2] = 834; + cached_b[3] = c[0].b[3] = 6138911; + cached_b[4] = c[1].b[0] = 1234; + cached_b[5] = c[1].b[1] = 5678; + cached_b[6] = c[1].b[2] = 91011; + cached_b[7] = c[1].b[3] = 121314; + + regs.gpr[3] = (unsigned long)&c[0].a; + regs.gpr[4] = 0; + + /* + * lxvpx XTp,RA,RB + * XTp = 32xTX + 2xTp + * let TX=1 Tp=1 RA=3 RB=4 + */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_LXVPX(34, 3, 4))); + + if (stepped == 1 && cpu_has_feature(CPU_FTR_VSX)) { + show_result("lxvpx", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("lxvpx", "PASS (!CPU_FTR_VSX)"); + else + show_result("lxvpx", "FAIL"); + } + + /*** stxvpx ***/ + + c[0].b[0] = 21379463; + c[0].b[1] = 87; + c[0].b[2] = 374234; + c[0].b[3] = 4; + c[1].b[0] = 90; + c[1].b[1] = 122; + c[1].b[2] = 555; + c[1].b[3] = 32144; + + /* + * stxvpx XSp,RA,RB + * XSp = 32xSX + 2xSp + * let SX=1 Sp=1 RA=3 RB=4 + */ + stepped = emulate_step(®s, ppc_inst(PPC_RAW_STXVPX(34, 3, 4))); + + if (stepped == 1 && cached_b[0] == c[0].b[0] && cached_b[1] == c[0].b[1] && + cached_b[2] == c[0].b[2] && cached_b[3] == c[0].b[3] && + cached_b[4] == c[1].b[0] && cached_b[5] == c[1].b[1] && + cached_b[6] == c[1].b[2] && cached_b[7] == c[1].b[3] && + cpu_has_feature(CPU_FTR_VSX)) { + show_result("stxvpx", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("stxvpx", "PASS (!CPU_FTR_VSX)"); + else + show_result("stxvpx", "FAIL"); + } +} +#else +static void __init test_lxvpx_stxvpx(void) +{ + show_result("lxvpx", "SKIP (CONFIG_VSX is not set)"); + show_result("stxvpx", "SKIP (CONFIG_VSX is not set)"); +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_VSX +static void __init test_plxvp_pstxvp(void) +{ + ppc_inst_t instr; + struct pt_regs regs; + union { + vector128 a; + u32 b[4]; + } c[2]; + u32 cached_b[8]; + int stepped = -1; + + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + show_result("plxvp", "SKIP (!CPU_FTR_ARCH_31)"); + show_result("pstxvp", "SKIP (!CPU_FTR_ARCH_31)"); + return; + } + + /*** plxvp ***/ + + cached_b[0] = c[0].b[0] = 18233; + cached_b[1] = c[0].b[1] = 34863571; + cached_b[2] = c[0].b[2] = 834; + cached_b[3] = c[0].b[3] = 6138911; + cached_b[4] = c[1].b[0] = 1234; + cached_b[5] = c[1].b[1] = 5678; + cached_b[6] = c[1].b[2] = 91011; + cached_b[7] = c[1].b[3] = 121314; + + init_pt_regs(®s); + regs.gpr[3] = (unsigned long)&c[0].a; + + /* + * plxvp XTp,D(RA),R + * XTp = 32xTX + 2xTp + * let RA=3 R=0 D=d0||d1=0 R=0 Tp=1 TX=1 + */ + instr = ppc_inst_prefix(PPC_RAW_PLXVP_P(34, 0, 3, 0), PPC_RAW_PLXVP_S(34, 0, 3, 0)); + + stepped = emulate_step(®s, instr); + if (stepped == 1 && cpu_has_feature(CPU_FTR_VSX)) { + show_result("plxvp", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("plxvp", "PASS (!CPU_FTR_VSX)"); + else + show_result("plxvp", "FAIL"); + } + + /*** pstxvp ***/ + + c[0].b[0] = 21379463; + c[0].b[1] = 87; + c[0].b[2] = 374234; + c[0].b[3] = 4; + c[1].b[0] = 90; + c[1].b[1] = 122; + c[1].b[2] = 555; + c[1].b[3] = 32144; + + /* + * pstxvp XSp,D(RA),R + * XSp = 32xSX + 2xSp + * let RA=3 D=d0||d1=0 R=0 Sp=1 SX=1 + */ + instr = ppc_inst_prefix(PPC_RAW_PSTXVP_P(34, 0, 3, 0), PPC_RAW_PSTXVP_S(34, 0, 3, 0)); + + stepped = emulate_step(®s, instr); + + if (stepped == 1 && cached_b[0] == c[0].b[0] && cached_b[1] == c[0].b[1] && + cached_b[2] == c[0].b[2] && cached_b[3] == c[0].b[3] && + cached_b[4] == c[1].b[0] && cached_b[5] == c[1].b[1] && + cached_b[6] == c[1].b[2] && cached_b[7] == c[1].b[3] && + cpu_has_feature(CPU_FTR_VSX)) { + show_result("pstxvp", "PASS"); + } else { + if (!cpu_has_feature(CPU_FTR_VSX)) + show_result("pstxvp", "PASS (!CPU_FTR_VSX)"); + else + show_result("pstxvp", "FAIL"); + } +} +#else +static void __init test_plxvp_pstxvp(void) +{ + show_result("plxvp", "SKIP (CONFIG_VSX is not set)"); + show_result("pstxvp", "SKIP (CONFIG_VSX is not set)"); +} +#endif /* CONFIG_VSX */ + +static void __init run_tests_load_store(void) +{ + test_ld(); + test_pld(); + test_lwz(); + test_plwz(); + test_lwzx(); + test_std(); + test_pstd(); + test_ldarx_stdcx(); + test_lfsx_stfsx(); + test_plfs_pstfs(); + test_lfdx_stfdx(); + test_plfd_pstfd(); + test_lvx_stvx(); + test_lxvd2x_stxvd2x(); + test_lxvp_stxvp(); + test_lxvpx_stxvpx(); + test_plxvp_pstxvp(); +} + +struct compute_test { + char *mnemonic; + unsigned long cpu_feature; + struct { + char *descr; + unsigned long flags; + ppc_inst_t instr; + struct pt_regs regs; + } subtests[MAX_SUBTESTS + 1]; +}; + +/* Extreme values for si0||si1 (the MLS:D-form 34 bit immediate field) */ +#define SI_MIN BIT(33) +#define SI_MAX (BIT(33) - 1) +#define SI_UMAX (BIT(34) - 1) + +static struct compute_test compute_tests[] = { + { + .mnemonic = "nop", + .subtests = { + { + .descr = "R0 = LONG_MAX", + .instr = ppc_inst(PPC_RAW_NOP()), + .regs = { + .gpr[0] = LONG_MAX, + } + } + } + }, + { + .mnemonic = "setb", + .cpu_feature = CPU_FTR_ARCH_300, + .subtests = { + { + .descr = "BFA = 1, CR = GT", + .instr = ppc_inst(PPC_RAW_SETB(20, 1)), + .regs = { + .ccr = 0x4000000, + } + }, + { + .descr = "BFA = 4, CR = LT", + .instr = ppc_inst(PPC_RAW_SETB(20, 4)), + .regs = { + .ccr = 0x8000, + } + }, + { + .descr = "BFA = 5, CR = EQ", + .instr = ppc_inst(PPC_RAW_SETB(20, 5)), + .regs = { + .ccr = 0x200, + } + } + } + }, + { + .mnemonic = "add", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADD(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "add.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADD_DOT(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "addc", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = ppc_inst(PPC_RAW_ADDC(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + }, + { + .mnemonic = "addc.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = ppc_inst(PPC_RAW_ADDC_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + }, + { + .mnemonic = "divde", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = 1L, RB = 0", + .instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = 1L, + .gpr[22] = 0, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + } + } + }, + { + .mnemonic = "divde.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = 1L, RB = 0", + .instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = 1L, + .gpr[22] = 0, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + } + } + }, + { + .mnemonic = "divdeu", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = 1L, RB = 0", + .instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = 1L, + .gpr[22] = 0, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX - 1, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX - 1, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MIN + 1, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = LONG_MIN + 1, + .gpr[22] = LONG_MIN, + } + } + } + }, + { + .mnemonic = "divdeu.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = 1L, RB = 0", + .instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = 1L, + .gpr[22] = 0, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX - 1, RB = LONG_MAX", + .instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)), + .regs = { + .gpr[21] = LONG_MAX - 1, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MIN + 1, RB = LONG_MIN", + .instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)), + .flags = IGNORE_GPR(20), + .regs = { + .gpr[21] = LONG_MIN + 1, + .gpr[22] = LONG_MIN, + } + } + } + }, + { + .mnemonic = "paddi", + .cpu_feature = CPU_FTR_ARCH_31, + .subtests = { + { + .descr = "RA = LONG_MIN, SI = SI_MIN, R = 0", + .instr = TEST_PADDI(21, 22, SI_MIN, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, SI = SI_MAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_MAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MAX, SI = SI_MAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_MAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, SI = SI_UMAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_UMAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, SI = 0x1, R = 0", + .instr = TEST_PADDI(21, 22, 0x1, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = INT_MIN, SI = SI_MIN, R = 0", + .instr = TEST_PADDI(21, 22, SI_MIN, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, SI = SI_MAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_MAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MAX, SI = SI_MAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_MAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, SI = 0x1, R = 0", + .instr = TEST_PADDI(21, 22, 0x1, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, SI = SI_MAX, R = 0", + .instr = TEST_PADDI(21, 22, SI_MAX, 0), + .regs = { + .gpr[21] = 0, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA is r0, SI = SI_MIN, R = 0", + .instr = TEST_PADDI(21, 0, SI_MIN, 0), + .regs = { + .gpr[21] = 0x0, + } + }, + { + .descr = "RA = 0, SI = SI_MIN, R = 0", + .instr = TEST_PADDI(21, 22, SI_MIN, 0), + .regs = { + .gpr[21] = 0x0, + .gpr[22] = 0x0, + } + }, + { + .descr = "RA is r0, SI = 0, R = 1", + .instr = TEST_PADDI(21, 0, 0, 1), + .regs = { + .gpr[21] = 0, + } + }, + { + .descr = "RA is r0, SI = SI_MIN, R = 1", + .instr = TEST_PADDI(21, 0, SI_MIN, 1), + .regs = { + .gpr[21] = 0, + } + }, + /* Invalid instruction form with R = 1 and RA != 0 */ + { + .descr = "RA = R22(0), SI = 0, R = 1", + .instr = TEST_PADDI(21, 22, 0, 1), + .flags = NEGATIVE_TEST, + .regs = { + .gpr[21] = 0, + .gpr[22] = 0, + } + } + } + } +}; + +static int __init emulate_compute_instr(struct pt_regs *regs, + ppc_inst_t instr, + bool negative) +{ + int analysed; + struct instruction_op op; + + if (!regs || !ppc_inst_val(instr)) + return -EINVAL; + + /* This is not a return frame regs */ + regs->nip = patch_site_addr(&patch__exec_instr); + + analysed = analyse_instr(&op, regs, instr); + if (analysed != 1 || GETTYPE(op.type) != COMPUTE) { + if (negative) + return -EFAULT; + pr_info("emulation failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); + return -EFAULT; + } + if (analysed == 1 && negative) + pr_info("negative test failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); + if (!negative) + emulate_update_regs(regs, &op); + return 0; +} + +static int __init execute_compute_instr(struct pt_regs *regs, + ppc_inst_t instr) +{ + extern int exec_instr(struct pt_regs *regs); + + if (!regs || !ppc_inst_val(instr)) + return -EINVAL; + + /* Patch the NOP with the actual instruction */ + patch_instruction_site(&patch__exec_instr, instr); + if (exec_instr(regs)) { + pr_info("execution failed, instruction = %08lx\n", ppc_inst_as_ulong(instr)); + return -EFAULT; + } + + return 0; +} + +#define gpr_mismatch(gprn, exp, got) \ + pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + gprn, exp, got) + +#define reg_mismatch(name, exp, got) \ + pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + name, exp, got) + +static void __init run_tests_compute(void) +{ + unsigned long flags; + struct compute_test *test; + struct pt_regs *regs, exp, got; + unsigned int i, j, k; + ppc_inst_t instr; + bool ignore_gpr, ignore_xer, ignore_ccr, passed, rc, negative; + + for (i = 0; i < ARRAY_SIZE(compute_tests); i++) { + test = &compute_tests[i]; + + if (test->cpu_feature && !early_cpu_has_feature(test->cpu_feature)) { + show_result(test->mnemonic, "SKIP (!CPU_FTR)"); + continue; + } + + for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) { + instr = test->subtests[j].instr; + flags = test->subtests[j].flags; + regs = &test->subtests[j].regs; + negative = flags & NEGATIVE_TEST; + ignore_xer = flags & IGNORE_XER; + ignore_ccr = flags & IGNORE_CCR; + passed = true; + + memcpy(&exp, regs, sizeof(struct pt_regs)); + memcpy(&got, regs, sizeof(struct pt_regs)); + + /* + * Set a compatible MSR value explicitly to ensure + * that XER and CR bits are updated appropriately + */ + exp.msr = MSR_KERNEL; + got.msr = MSR_KERNEL; + + rc = emulate_compute_instr(&got, instr, negative) != 0; + if (negative) { + /* skip executing instruction */ + passed = rc; + goto print; + } else if (rc || execute_compute_instr(&exp, instr)) { + passed = false; + goto print; + } + + /* Verify GPR values */ + for (k = 0; k < 32; k++) { + ignore_gpr = flags & IGNORE_GPR(k); + if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) { + passed = false; + gpr_mismatch(k, exp.gpr[k], got.gpr[k]); + } + } + + /* Verify LR value */ + if (exp.link != got.link) { + passed = false; + reg_mismatch("LR", exp.link, got.link); + } + + /* Verify XER value */ + if (!ignore_xer && exp.xer != got.xer) { + passed = false; + reg_mismatch("XER", exp.xer, got.xer); + } + + /* Verify CR value */ + if (!ignore_ccr && exp.ccr != got.ccr) { + passed = false; + reg_mismatch("CR", exp.ccr, got.ccr); + } + +print: + show_result_with_descr(test->mnemonic, + test->subtests[j].descr, + passed ? "PASS" : "FAIL"); + } + } +} + +static int __init test_emulate_step(void) +{ + printk(KERN_INFO "Running instruction emulation self-tests ...\n"); + run_tests_load_store(); + run_tests_compute(); + + return 0; +} +late_initcall(test_emulate_step); diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S new file mode 100644 index 0000000000..e2b646a4f7 --- /dev/null +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Non-emulated single-stepping support (currently limited to basic integer + * computations) used to validate the instruction emulation infrastructure. + * + * Copyright (C) 2019 IBM Corporation + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> +#include <linux/errno.h> + +/* int exec_instr(struct pt_regs *regs) */ +_GLOBAL(exec_instr) + + /* + * Stack frame layout (INT_FRAME_SIZE bytes) + * In-memory pt_regs (SP + STACK_INT_FRAME_REGS) + * Scratch space (SP + 8) + * Back chain (SP + 0) + */ + + /* + * Allocate a new stack frame with enough space to hold the register + * states in an in-memory pt_regs and also create the back chain to + * the caller's stack frame. + */ + stdu r1, -INT_FRAME_SIZE(r1) + + /* + * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2) + * and local variables (GPR14 to GPR31). The register for the pt_regs + * parameter (GPR3) is saved additionally to ensure that the resulting + * register state can still be saved even if GPR3 gets overwritten + * when loading the initial register state for the test instruction. + * The stack pointer (GPR1) and the thread pointer (GPR13) are not + * saved as these should not be modified anyway. + */ + SAVE_GPRS(2, 3, r1) + SAVE_NVGPRS(r1) + + /* + * Save LR on stack to ensure that the return address is available + * even if it gets overwritten by the test instruction. + */ + mflr r0 + std r0, _LINK(r1) + + /* + * Save CR on stack. For simplicity, the entire register is saved + * even though only fields 2 to 4 are non-volatile. + */ + mfcr r0 + std r0, _CCR(r1) + + /* + * Load register state for the test instruction without touching the + * critical non-volatile registers. The register state is passed as a + * pointer to a pt_regs instance. + */ + subi r31, r3, GPR0 + + /* Load LR from pt_regs */ + ld r0, _LINK(r31) + mtlr r0 + + /* Load CR from pt_regs */ + ld r0, _CCR(r31) + mtcr r0 + + /* Load XER from pt_regs */ + ld r0, _XER(r31) + mtxer r0 + + /* Load GPRs from pt_regs */ + REST_GPR(0, r31) + REST_GPRS(2, 12, r31) + REST_NVGPRS(r31) + + /* Placeholder for the test instruction */ + .balign 64 +1: nop + nop + patch_site 1b patch__exec_instr + + /* + * Since GPR3 is overwritten, temporarily restore it back to its + * original state, i.e. the pointer to pt_regs, to ensure that the + * resulting register state can be saved. Before doing this, a copy + * of it is created in the scratch space which is used later on to + * save it to pt_regs. + */ + std r3, 8(r1) + REST_GPR(3, r1) + + /* Save resulting GPR state to pt_regs */ + subi r3, r3, GPR0 + SAVE_GPR(0, r3) + SAVE_GPR(2, r3) + SAVE_GPRS(4, 12, r3) + SAVE_NVGPRS(r3) + + /* Save resulting LR to pt_regs */ + mflr r0 + std r0, _LINK(r3) + + /* Save resulting CR to pt_regs */ + mfcr r0 + std r0, _CCR(r3) + + /* Save resulting XER to pt_regs */ + mfxer r0 + std r0, _XER(r3) + + /* Restore resulting GPR3 from scratch space and save it to pt_regs */ + ld r0, 8(r1) + std r0, GPR3(r3) + + /* Set return value to denote execution success */ + li r3, 0 + + /* Continue */ + b 3f + + /* Set return value to denote execution failure */ +2: li r3, -EFAULT + + /* Restore the non-volatile GPRs from stack */ +3: REST_GPR(2, r1) + REST_NVGPRS(r1) + + /* Restore LR from stack to be able to return */ + ld r0, _LINK(r1) + mtlr r0 + + /* Restore CR from stack */ + ld r0, _CCR(r1) + mtcr r0 + + /* Tear down stack frame */ + addi r1, r1, INT_FRAME_SIZE + + /* Return */ + blr + + /* Setup exception table */ + EX_TABLE(1b, 2b) + +_ASM_NOKPROBE_SYMBOL(exec_instr) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c new file mode 100644 index 0000000000..d491da8d18 --- /dev/null +++ b/arch/powerpc/lib/vmx-helper.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> + * Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <asm/switch_to.h> + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + /* + * We need to disable page faults as they can call schedule and + * thus make us lose the VMX context. So on page faults, we just + * fail which will cause a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); + return 0; +} + +int enter_vmx_ops(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_ops(void *dest) +{ + disable_kernel_altivec(); + preempt_enable(); + return dest; +} diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c new file mode 100644 index 0000000000..aab49d056d --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +/* + * Sparse (as at v0.5.0) gets very, very confused by this file. + * Make it a bit simpler for it. + */ +#if !defined(__CHECKER__) +#include <altivec.h> +#else +#define vec_xor(a, b) a ^ b +#define vector __attribute__((vector_size(16))) +#endif + +#include "xor_vmx.h" + +typedef vector signed char unative_t; + +#define DEFINE(V) \ + unative_t *V = (unative_t *)V##_in; \ + unative_t V##_0, V##_1, V##_2, V##_3 + +#define LOAD(V) \ + do { \ + V##_0 = V[0]; \ + V##_1 = V[1]; \ + V##_2 = V[2]; \ + V##_3 = V[3]; \ + } while (0) + +#define STORE(V) \ + do { \ + V[0] = V##_0; \ + V[1] = V##_1; \ + V[2] = V##_2; \ + V[3] = V##_3; \ + } while (0) + +#define XOR(V1, V2) \ + do { \ + V1##_0 = vec_xor(V1##_0, V2##_0); \ + V1##_1 = vec_xor(V1##_1, V2##_1); \ + V1##_2 = vec_xor(V1##_2, V2##_2); \ + V1##_3 = vec_xor(V1##_3, V2##_3); \ + } while (0) + +void __xor_altivec_2(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in) +{ + DEFINE(v1); + DEFINE(v2); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + do { + LOAD(v1); + LOAD(v2); + XOR(v1, v2); + STORE(v1); + + v1 += 4; + v2 += 4; + } while (--lines > 0); +} + +void __xor_altivec_3(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + XOR(v1, v2); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + } while (--lines > 0); +} + +void __xor_altivec_4(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + } while (--lines > 0); +} + +void __xor_altivec_5(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in, + const unsigned long * __restrict v5_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + DEFINE(v5); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + LOAD(v5); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v5); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + v5 += 4; + } while (--lines > 0); +} diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h new file mode 100644 index 0000000000..573c41d90d --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Simple interface to link xor_vmx.c and xor_vmx_glue.c + * + * Separating these file ensures that no altivec instructions are run + * outside of the enable/disable altivec block. + */ + +void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3); +void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5); diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c new file mode 100644 index 0000000000..35d917ece4 --- /dev/null +++ b/arch/powerpc/lib/xor_vmx_glue.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Altivec XOR operations + * + * Copyright 2017 IBM Corp. + */ + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> +#include <asm/xor_altivec.h> +#include "xor_vmx.h" + +void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_2(bytes, p1, p2); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_3(bytes, p1, p2, p3); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_4(bytes, p1, p2, p3, p4); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + preempt_disable(); + enable_kernel_altivec(); + __xor_altivec_5(bytes, p1, p2, p3, p4, p5); + disable_kernel_altivec(); + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); |