diff options
Diffstat (limited to '')
97 files changed, 29954 insertions, 0 deletions
diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore new file mode 100644 index 000000000..c5f676c3c --- /dev/null +++ b/arch/s390/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile new file mode 100644 index 000000000..762fc4537 --- /dev/null +++ b/arch/s390/kernel/Makefile @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +ifdef CONFIG_FUNCTION_TRACER + +# Do not trace tracer code +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace early setup code +CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_early_nobss.o = $(CC_FLAGS_FTRACE) + +endif + +GCOV_PROFILE_early.o := n +GCOV_PROFILE_early_nobss.o := n + +KCOV_INSTRUMENT_early.o := n +KCOV_INSTRUMENT_early_nobss.o := n + +UBSAN_SANITIZE_early.o := n +UBSAN_SANITIZE_early_nobss.o := n + +KASAN_SANITIZE_early_nobss.o := n + +# +# Passing null pointers is ok for smp code, since we access the lowcore here. +# +CFLAGS_smp.o := -Wno-nonnull + +# +# Disable tailcall optimizations for stack / callchain walking functions +# since this might generate broken code when accessing register 15 and +# passing its content to other functions. +# +CFLAGS_stacktrace.o += -fno-optimize-sibling-calls +CFLAGS_dumpstack.o += -fno-optimize-sibling-calls + +# +# Pass UTS_MACHINE for user_regset definition +# +CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' + +obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o +obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o +obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o +obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o +obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += nospec-branch.o + +extra-y += head64.o vmlinux.lds + +obj-$(CONFIG_SYSFS) += nospec-sysfs.o +CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) + +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o +obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o +obj-$(CONFIG_AUDIT) += audit.o +compat-obj-$(CONFIG_AUDIT) += compat_audit.o +obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o +obj-$(CONFIG_COMPAT) += compat_wrapper.o $(compat-obj-y) +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o + +obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o + +obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o perf_cpum_sf.o +obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o + +obj-$(CONFIG_TRACEPOINTS) += trace.o + +# vdso +obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ + +chkbss := head64.o early_nobss.o +include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c new file mode 100644 index 000000000..8e1f2aee8 --- /dev/null +++ b/arch/s390/kernel/alternative.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> +#include <asm/alternative.h> +#include <asm/facility.h> +#include <asm/nospec-branch.h> + +#define MAX_PATCH_LEN (255 - 1) + +static int __initdata_or_module alt_instr_disabled; + +static int __init disable_alternative_instructions(char *str) +{ + alt_instr_disabled = 1; + return 0; +} + +early_param("noaltinstr", disable_alternative_instructions); + +struct brcl_insn { + u16 opc; + s32 disp; +} __packed; + +static u16 __initdata_or_module nop16 = 0x0700; +static u32 __initdata_or_module nop32 = 0x47000000; +static struct brcl_insn __initdata_or_module nop48 = { + 0xc004, 0 +}; + +static const void *nops[] __initdata_or_module = { + &nop16, + &nop32, + &nop48 +}; + +static void __init_or_module add_jump_padding(void *insns, unsigned int len) +{ + struct brcl_insn brcl = { + 0xc0f4, + len / 2 + }; + + memcpy(insns, &brcl, sizeof(brcl)); + insns += sizeof(brcl); + len -= sizeof(brcl); + + while (len > 0) { + memcpy(insns, &nop16, 2); + insns += 2; + len -= 2; + } +} + +static void __init_or_module add_padding(void *insns, unsigned int len) +{ + if (len > 6) + add_jump_padding(insns, len); + else if (len >= 2) + memcpy(insns, nops[len / 2 - 1], len); +} + +static void __init_or_module __apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. + */ + for (a = start; a < end; a++) { + int insnbuf_sz = 0; + + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + + if (!__test_facility(a->facility, + S390_lowcore.alt_stfle_fac_list)) + continue; + + if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + WARN_ONCE(1, "cpu alternatives instructions length is " + "odd, skipping patching\n"); + continue; + } + + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; + + if (a->instrlen > a->replacementlen) { + add_padding(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + + s390_kernel_write(instr, insnbuf, insnbuf_sz); + } +} + +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + if (!alt_instr_disabled) + __apply_alternatives(start, end); +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +void __init apply_alternative_instructions(void) +{ + apply_alternatives(__alt_instructions, __alt_instructions_end); +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c new file mode 100644 index 000000000..e9d09f6e8 --- /dev/null +++ b/arch/s390/kernel/asm-offsets.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ + +#define ASM_OFFSETS_C + +#include <linux/kbuild.h> +#include <linux/kvm_host.h> +#include <linux/sched.h> +#include <linux/purgatory.h> +#include <asm/idle.h> +#include <asm/vdso.h> +#include <asm/pgtable.h> +#include <asm/gmap.h> +#include <asm/nmi.h> + +int main(void) +{ + /* task struct offsets */ + OFFSET(__TASK_stack, task_struct, stack); + OFFSET(__TASK_thread, task_struct, thread); + OFFSET(__TASK_pid, task_struct, pid); + BLANK(); + /* thread struct offsets */ + OFFSET(__THREAD_ksp, thread_struct, ksp); + OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); + OFFSET(__THREAD_last_break, thread_struct, last_break); + OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); + OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); + OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); + OFFSET(__THREAD_per_address, thread_struct, per_event.address); + OFFSET(__THREAD_per_paid, thread_struct, per_event.paid); + OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); + BLANK(); + /* thread info offsets */ + OFFSET(__TI_flags, task_struct, thread_info.flags); + BLANK(); + /* pt_regs offsets */ + OFFSET(__PT_ARGS, pt_regs, args); + OFFSET(__PT_PSW, pt_regs, psw); + OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); + OFFSET(__PT_INT_CODE, pt_regs, int_code); + OFFSET(__PT_INT_PARM, pt_regs, int_parm); + OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long); + OFFSET(__PT_FLAGS, pt_regs, flags); + DEFINE(__PT_SIZE, sizeof(struct pt_regs)); + BLANK(); + /* stack_frame offsets */ + OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); + OFFSET(__SF_GPRS, stack_frame, gprs); + OFFSET(__SF_EMPTY, stack_frame, empty1); + OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]); + OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); + OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]); + BLANK(); + /* timeval/timezone offsets for use by vdso */ + OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); + OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp); + OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec); + OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec); + OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec); + OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec); + OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec); + OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec); + OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec); + OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec); + OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest); + OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); + OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); + OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); + OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); + OFFSET(__VDSO_TS_END, vdso_data, ts_end); + OFFSET(__VDSO_CLOCK_REALTIME_RES, vdso_data, hrtimer_res); + OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); + OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); + OFFSET(__VDSO_CPU_NR, vdso_per_cpu_data, cpu_nr); + OFFSET(__VDSO_NODE_ID, vdso_per_cpu_data, node_id); + BLANK(); + /* constants used by the vdso */ + DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME); + DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); + DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); + DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID); + DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC); + BLANK(); + /* idle data offsets */ + OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); + OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); + OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); + OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); + BLANK(); + /* hardware defined lowcore locations 0x000 - 0x1ff */ + OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); + OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); + OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); + OFFSET(__LC_SVC_ILC, lowcore, svc_ilc); + OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code); + OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); + OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); + OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); + OFFSET(__LC_MON_CLASS_NR, lowcore, mon_class_num); + OFFSET(__LC_PER_CODE, lowcore, per_code); + OFFSET(__LC_PER_ATMID, lowcore, per_atmid); + OFFSET(__LC_PER_ADDRESS, lowcore, per_address); + OFFSET(__LC_EXC_ACCESS_ID, lowcore, exc_access_id); + OFFSET(__LC_PER_ACCESS_ID, lowcore, per_access_id); + OFFSET(__LC_OP_ACCESS_ID, lowcore, op_access_id); + OFFSET(__LC_AR_MODE_ID, lowcore, ar_mode_id); + OFFSET(__LC_TRANS_EXC_CODE, lowcore, trans_exc_code); + OFFSET(__LC_MON_CODE, lowcore, monitor_code); + OFFSET(__LC_SUBCHANNEL_ID, lowcore, subchannel_id); + OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr); + OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm); + OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word); + OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); + OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); + OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); + OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); + OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); + OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); + OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); + OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); + OFFSET(__LC_PGM_OLD_PSW, lowcore, program_old_psw); + OFFSET(__LC_MCK_OLD_PSW, lowcore, mcck_old_psw); + OFFSET(__LC_IO_OLD_PSW, lowcore, io_old_psw); + OFFSET(__LC_RST_NEW_PSW, lowcore, restart_psw); + OFFSET(__LC_EXT_NEW_PSW, lowcore, external_new_psw); + OFFSET(__LC_SVC_NEW_PSW, lowcore, svc_new_psw); + OFFSET(__LC_PGM_NEW_PSW, lowcore, program_new_psw); + OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw); + OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw); + /* software defined lowcore locations 0x200 - 0xdff*/ + OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); + OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); + OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); + OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); + OFFSET(__LC_RETURN_PSW, lowcore, return_psw); + OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); + OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer); + OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer); + OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer); + OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer); + OFFSET(__LC_USER_TIMER, lowcore, user_timer); + OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer); + OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer); + OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); + OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); + OFFSET(__LC_INT_CLOCK, lowcore, int_clock); + OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); + OFFSET(__LC_CURRENT, lowcore, current_task); + OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); + OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); + OFFSET(__LC_PANIC_STACK, lowcore, panic_stack); + OFFSET(__LC_RESTART_STACK, lowcore, restart_stack); + OFFSET(__LC_RESTART_FN, lowcore, restart_fn); + OFFSET(__LC_RESTART_DATA, lowcore, restart_data); + OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); + OFFSET(__LC_USER_ASCE, lowcore, user_asce); + OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce); + OFFSET(__LC_LPP, lowcore, lpp); + OFFSET(__LC_CURRENT_PID, lowcore, current_pid); + OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); + OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data); + OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); + OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); + OFFSET(__LC_GMAP, lowcore, gmap); + OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ + OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + /* hardware defined lowcore locations 0x1000 - 0x18ff */ + OFFSET(__LC_MCESAD, lowcore, mcesad); + OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); + OFFSET(__LC_FPREGS_SAVE_AREA, lowcore, floating_pt_save_area); + OFFSET(__LC_GPREGS_SAVE_AREA, lowcore, gpregs_save_area); + OFFSET(__LC_PSW_SAVE_AREA, lowcore, psw_save_area); + OFFSET(__LC_PREFIX_SAVE_AREA, lowcore, prefixreg_save_area); + OFFSET(__LC_FP_CREG_SAVE_AREA, lowcore, fpt_creg_save_area); + OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); + OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); + OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); + OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); + OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); + BLANK(); + /* extended machine check save area */ + OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); + BLANK(); + /* gmap/sie offsets */ + OFFSET(__GMAP_ASCE, gmap, asce); + OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); + OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20); + /* kexec_sha_region */ + OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start); + OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len); + DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region)); + return 0; +} diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c new file mode 100644 index 000000000..d395c6c99 --- /dev/null +++ b/arch/s390/kernel/audit.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> +#include "audit.h" + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +static unsigned signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_COMPAT + if (arch == AUDIT_ARCH_S390) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_COMPAT + if (abi == AUDIT_ARCH_S390) + return s390_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_COMPAT + audit_register_class(AUDIT_CLASS_WRITE_32, s390_write_class); + audit_register_class(AUDIT_CLASS_READ_32, s390_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, s390_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, s390_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, s390_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/s390/kernel/audit.h b/arch/s390/kernel/audit.h new file mode 100644 index 000000000..4d4b59641 --- /dev/null +++ b/arch/s390/kernel/audit.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_KERNEL_AUDIT_H +#define __ARCH_S390_KERNEL_AUDIT_H + +#include <linux/types.h> + +#ifdef CONFIG_COMPAT +extern int s390_classify_syscall(unsigned); +extern __u32 s390_dir_class[]; +extern __u32 s390_write_class[]; +extern __u32 s390_read_class[]; +extern __u32 s390_chattr_class[]; +extern __u32 s390_signal_class[]; +#endif /* CONFIG_COMPAT */ + +#endif /* __ARCH_S390_KERNEL_AUDIT_H */ diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S new file mode 100644 index 000000000..b65874b0b --- /dev/null +++ b/arch/s390/kernel/base.S @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arch/s390/kernel/base.S + * + * Copyright IBM Corp. 2006, 2007 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + * Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> +#include <asm/ptrace.h> +#include <asm/sigp.h> + + GEN_BR_THUNK %r9 + GEN_BR_THUNK %r14 + +ENTRY(s390_base_mcck_handler) + basr %r13,0 +0: lg %r15,__LC_PANIC_STACK # load panic stack + aghi %r15,-STACK_FRAME_OVERHEAD + larl %r1,s390_base_mcck_handler_fn + lg %r9,0(%r1) + ltgr %r9,%r9 + jz 1f + BASR_EX %r14,%r9 +1: la %r1,4095 + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) + lpswe __LC_MCK_OLD_PSW + + .section .bss + .align 8 + .globl s390_base_mcck_handler_fn +s390_base_mcck_handler_fn: + .quad 0 + .previous + +ENTRY(s390_base_ext_handler) + stmg %r0,%r15,__LC_SAVE_AREA_ASYNC + basr %r13,0 +0: aghi %r15,-STACK_FRAME_OVERHEAD + larl %r1,s390_base_ext_handler_fn + lg %r9,0(%r1) + ltgr %r9,%r9 + jz 1f + BASR_EX %r14,%r9 +1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC + ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit + lpswe __LC_EXT_OLD_PSW + + .section .bss + .align 8 + .globl s390_base_ext_handler_fn +s390_base_ext_handler_fn: + .quad 0 + .previous + +ENTRY(s390_base_pgm_handler) + stmg %r0,%r15,__LC_SAVE_AREA_SYNC + basr %r13,0 +0: aghi %r15,-STACK_FRAME_OVERHEAD + larl %r1,s390_base_pgm_handler_fn + lg %r9,0(%r1) + ltgr %r9,%r9 + jz 1f + BASR_EX %r14,%r9 + lmg %r0,%r15,__LC_SAVE_AREA_SYNC + lpswe __LC_PGM_OLD_PSW +1: lpswe disabled_wait_psw-0b(%r13) + + .align 8 +disabled_wait_psw: + .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler + + .section .bss + .align 8 + .globl s390_base_pgm_handler_fn +s390_base_pgm_handler_fn: + .quad 0 + .previous + +# +# Calls diag 308 subcode 1 and continues execution +# +ENTRY(diag308_reset) + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,.Lctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lprefix # Save prefix register + stpx 0(%r4) + larl %r4,.Lprefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 + lghi %r3,0 + lg %r4,0(%r4) # Save PSW + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lprefix # Restore prefix register + spx 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + BR_EX %r14 +.align 16 +.Lrestart_psw: + .long 0x00080000,0x80000000 + .Lrestart_part2 + + .section .data..nosave,"aw",@progbits +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + .previous + + .section .bss +.align 8 +.Lctlreg0: + .quad 0 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 +.Lprefix: + .long 0 +.Lprefix_zero: + .long 0 + .previous diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c new file mode 100644 index 000000000..d66825e53 --- /dev/null +++ b/arch/s390/kernel/cache.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Extract CPU cache information and expose them via sysfs. + * + * Copyright IBM Corp. 2012 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/cacheinfo.h> +#include <asm/facility.h> + +enum { + CACHE_SCOPE_NOTEXISTS, + CACHE_SCOPE_PRIVATE, + CACHE_SCOPE_SHARED, + CACHE_SCOPE_RESERVED, +}; + +enum { + CTYPE_SEPARATE, + CTYPE_DATA, + CTYPE_INSTRUCTION, + CTYPE_UNIFIED, +}; + +enum { + EXTRACT_TOPOLOGY, + EXTRACT_LINE_SIZE, + EXTRACT_SIZE, + EXTRACT_ASSOCIATIVITY, +}; + +enum { + CACHE_TI_UNIFIED = 0, + CACHE_TI_DATA = 0, + CACHE_TI_INSTRUCTION, +}; + +struct cache_info { + unsigned char : 4; + unsigned char scope : 2; + unsigned char type : 2; +}; + +#define CACHE_MAX_LEVEL 8 +union cache_topology { + struct cache_info ci[CACHE_MAX_LEVEL]; + unsigned long long raw; +}; + +static const char * const cache_type_string[] = { + "", + "Instruction", + "Data", + "", + "Unified", +}; + +static const enum cache_type cache_type_map[] = { + [CTYPE_SEPARATE] = CACHE_TYPE_SEPARATE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INSTRUCTION] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +void show_cacheinfo(struct seq_file *m) +{ + struct cpu_cacheinfo *this_cpu_ci; + struct cacheinfo *cache; + int idx; + + if (!test_facility(34)) + return; + this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + cache = this_cpu_ci->info_list + idx; + seq_printf(m, "cache%-11d: ", idx); + seq_printf(m, "level=%d ", cache->level); + seq_printf(m, "type=%s ", cache_type_string[cache->type]); + seq_printf(m, "scope=%s ", + cache->disable_sysfs ? "Shared" : "Private"); + seq_printf(m, "size=%dK ", cache->size >> 10); + seq_printf(m, "line_size=%u ", cache->coherency_line_size); + seq_printf(m, "associativity=%d", cache->ways_of_associativity); + seq_puts(m, "\n"); + } +} + +static inline enum cache_type get_cache_type(struct cache_info *ci, int level) +{ + if (level >= CACHE_MAX_LEVEL) + return CACHE_TYPE_NOCACHE; + ci += level; + if (ci->scope != CACHE_SCOPE_SHARED && ci->scope != CACHE_SCOPE_PRIVATE) + return CACHE_TYPE_NOCACHE; + return cache_type_map[ci->type]; +} + +static inline unsigned long ecag(int ai, int li, int ti) +{ + return __ecag(ECAG_CACHE_ATTRIBUTE, ai << 4 | li << 1 | ti); +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, int private, + enum cache_type type, unsigned int level, int cpu) +{ + int ti, num_sets; + + if (type == CACHE_TYPE_INST) + ti = CACHE_TI_INSTRUCTION; + else + ti = CACHE_TI_UNIFIED; + this_leaf->level = level + 1; + this_leaf->type = type; + this_leaf->coherency_line_size = ecag(EXTRACT_LINE_SIZE, level, ti); + this_leaf->ways_of_associativity = ecag(EXTRACT_ASSOCIATIVITY, level, ti); + this_leaf->size = ecag(EXTRACT_SIZE, level, ti); + num_sets = this_leaf->size / this_leaf->coherency_line_size; + num_sets /= this_leaf->ways_of_associativity; + this_leaf->number_of_sets = num_sets; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + if (!private) + this_leaf->disable_sysfs = true; +} + +int init_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + unsigned int level = 0, leaves = 0; + union cache_topology ct; + enum cache_type ctype; + + if (!test_facility(34)) + return -EOPNOTSUPP; + if (!this_cpu_ci) + return -EINVAL; + ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); + do { + ctype = get_cache_type(&ct.ci[0], level); + if (ctype == CACHE_TYPE_NOCACHE) + break; + /* Separate instruction and data caches */ + leaves += (ctype == CACHE_TYPE_SEPARATE) ? 2 : 1; + } while (++level < CACHE_MAX_LEVEL); + this_cpu_ci->num_levels = level; + this_cpu_ci->num_leaves = leaves; + return 0; +} + +int populate_cache_leaves(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + unsigned int level, idx, pvt; + union cache_topology ct; + enum cache_type ctype; + + if (!test_facility(34)) + return -EOPNOTSUPP; + ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); + for (idx = 0, level = 0; level < this_cpu_ci->num_levels && + idx < this_cpu_ci->num_leaves; idx++, level++) { + if (!this_leaf) + return -EINVAL; + pvt = (ct.ci[level].scope == CACHE_SCOPE_PRIVATE) ? 1 : 0; + ctype = get_cache_type(&ct.ci[0], level); + if (ctype == CACHE_TYPE_SEPARATE) { + ci_leaf_init(this_leaf++, pvt, CACHE_TYPE_DATA, level, cpu); + ci_leaf_init(this_leaf++, pvt, CACHE_TYPE_INST, level, cpu); + } else { + ci_leaf_init(this_leaf++, pvt, ctype, level, cpu); + } + } + return 0; +} diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c new file mode 100644 index 000000000..444fb1f66 --- /dev/null +++ b/arch/s390/kernel/compat_audit.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#undef __s390x__ +#include <asm/unistd.h> +#include "audit.h" + +unsigned s390_dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +unsigned s390_chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +unsigned s390_write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +unsigned s390_read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +unsigned s390_signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int s390_classify_syscall(unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + return 5; + default: + return 1; + } +} diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c new file mode 100644 index 000000000..8ac38d51e --- /dev/null +++ b/arch/s390/kernel/compat_linux.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Gerhard Tonn (ton@de.ibm.com) + * Thomas Spatzier (tspat@de.ibm.com) + * + * Conversion between 31bit and 64bit native syscalls. + * + * Heavily inspired by the 32-bit Sparc compat code which is + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * + */ + + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/signal.h> +#include <linux/resource.h> +#include <linux/times.h> +#include <linux/smp.h> +#include <linux/sem.h> +#include <linux/msg.h> +#include <linux/shm.h> +#include <linux/uio.h> +#include <linux/quota.h> +#include <linux/poll.h> +#include <linux/personality.h> +#include <linux/stat.h> +#include <linux/filter.h> +#include <linux/highmem.h> +#include <linux/highuid.h> +#include <linux/mman.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/icmpv6.h> +#include <linux/syscalls.h> +#include <linux/sysctl.h> +#include <linux/binfmts.h> +#include <linux/capability.h> +#include <linux/compat.h> +#include <linux/vfs.h> +#include <linux/ptrace.h> +#include <linux/fadvise.h> +#include <linux/ipc.h> +#include <linux/slab.h> + +#include <asm/types.h> +#include <linux/uaccess.h> + +#include <net/scm.h> +#include <net/sock.h> + +#include "compat_linux.h" + +/* For this source file, we want overflow handling. */ + +#undef high2lowuid +#undef high2lowgid +#undef low2highuid +#undef low2highgid +#undef SET_UID16 +#undef SET_GID16 +#undef NEW_TO_OLD_UID +#undef NEW_TO_OLD_GID +#undef SET_OLDSTAT_UID +#undef SET_OLDSTAT_GID +#undef SET_STAT_UID +#undef SET_STAT_GID + +#define high2lowuid(uid) ((uid) > 65535) ? (u16)overflowuid : (u16)(uid) +#define high2lowgid(gid) ((gid) > 65535) ? (u16)overflowgid : (u16)(gid) +#define low2highuid(uid) ((uid) == (u16)-1) ? (uid_t)-1 : (uid_t)(uid) +#define low2highgid(gid) ((gid) == (u16)-1) ? (gid_t)-1 : (gid_t)(gid) +#define SET_UID16(var, uid) var = high2lowuid(uid) +#define SET_GID16(var, gid) var = high2lowgid(gid) +#define NEW_TO_OLD_UID(uid) high2lowuid(uid) +#define NEW_TO_OLD_GID(gid) high2lowgid(gid) +#define SET_OLDSTAT_UID(stat, uid) (stat).st_uid = high2lowuid(uid) +#define SET_OLDSTAT_GID(stat, gid) (stat).st_gid = high2lowgid(gid) +#define SET_STAT_UID(stat, uid) (stat).st_uid = high2lowuid(uid) +#define SET_STAT_GID(stat, gid) (stat).st_gid = high2lowgid(gid) + +COMPAT_SYSCALL_DEFINE3(s390_chown16, const char __user *, filename, + u16, user, u16, group) +{ + return ksys_chown(filename, low2highuid(user), low2highgid(group)); +} + +COMPAT_SYSCALL_DEFINE3(s390_lchown16, const char __user *, + filename, u16, user, u16, group) +{ + return ksys_lchown(filename, low2highuid(user), low2highgid(group)); +} + +COMPAT_SYSCALL_DEFINE3(s390_fchown16, unsigned int, fd, u16, user, u16, group) +{ + return ksys_fchown(fd, low2highuid(user), low2highgid(group)); +} + +COMPAT_SYSCALL_DEFINE2(s390_setregid16, u16, rgid, u16, egid) +{ + return sys_setregid(low2highgid(rgid), low2highgid(egid)); +} + +COMPAT_SYSCALL_DEFINE1(s390_setgid16, u16, gid) +{ + return sys_setgid(low2highgid(gid)); +} + +COMPAT_SYSCALL_DEFINE2(s390_setreuid16, u16, ruid, u16, euid) +{ + return sys_setreuid(low2highuid(ruid), low2highuid(euid)); +} + +COMPAT_SYSCALL_DEFINE1(s390_setuid16, u16, uid) +{ + return sys_setuid(low2highuid(uid)); +} + +COMPAT_SYSCALL_DEFINE3(s390_setresuid16, u16, ruid, u16, euid, u16, suid) +{ + return sys_setresuid(low2highuid(ruid), low2highuid(euid), + low2highuid(suid)); +} + +COMPAT_SYSCALL_DEFINE3(s390_getresuid16, u16 __user *, ruidp, + u16 __user *, euidp, u16 __user *, suidp) +{ + const struct cred *cred = current_cred(); + int retval; + u16 ruid, euid, suid; + + ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); + euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); + suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); + + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); + + return retval; +} + +COMPAT_SYSCALL_DEFINE3(s390_setresgid16, u16, rgid, u16, egid, u16, sgid) +{ + return sys_setresgid(low2highgid(rgid), low2highgid(egid), + low2highgid(sgid)); +} + +COMPAT_SYSCALL_DEFINE3(s390_getresgid16, u16 __user *, rgidp, + u16 __user *, egidp, u16 __user *, sgidp) +{ + const struct cred *cred = current_cred(); + int retval; + u16 rgid, egid, sgid; + + rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); + egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); + sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); + + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); + + return retval; +} + +COMPAT_SYSCALL_DEFINE1(s390_setfsuid16, u16, uid) +{ + return sys_setfsuid(low2highuid(uid)); +} + +COMPAT_SYSCALL_DEFINE1(s390_setfsgid16, u16, gid) +{ + return sys_setfsgid(low2highgid(gid)); +} + +static int groups16_to_user(u16 __user *grouplist, struct group_info *group_info) +{ + struct user_namespace *user_ns = current_user_ns(); + int i; + u16 group; + kgid_t kgid; + + for (i = 0; i < group_info->ngroups; i++) { + kgid = group_info->gid[i]; + group = (u16)from_kgid_munged(user_ns, kgid); + if (put_user(group, grouplist+i)) + return -EFAULT; + } + + return 0; +} + +static int groups16_from_user(struct group_info *group_info, u16 __user *grouplist) +{ + struct user_namespace *user_ns = current_user_ns(); + int i; + u16 group; + kgid_t kgid; + + for (i = 0; i < group_info->ngroups; i++) { + if (get_user(group, grouplist+i)) + return -EFAULT; + + kgid = make_kgid(user_ns, (gid_t)group); + if (!gid_valid(kgid)) + return -EINVAL; + + group_info->gid[i] = kgid; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(s390_getgroups16, int, gidsetsize, u16 __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + get_group_info(cred->group_info); + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups16_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + put_group_info(cred->group_info); + return i; +} + +COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!may_setgroups()) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups16_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + groups_sort(group_info); + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +COMPAT_SYSCALL_DEFINE0(s390_getuid16) +{ + return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); +} + +COMPAT_SYSCALL_DEFINE0(s390_geteuid16) +{ + return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); +} + +COMPAT_SYSCALL_DEFINE0(s390_getgid16) +{ + return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); +} + +COMPAT_SYSCALL_DEFINE0(s390_getegid16) +{ + return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); +} + +#ifdef CONFIG_SYSVIPC +COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, compat_ulong_t, second, + compat_ulong_t, third, compat_uptr_t, ptr) +{ + if (call >> 16) /* hack for backward compatibility */ + return -EINVAL; + return compat_sys_ipc(call, first, second, third, ptr, third); +} +#endif + +COMPAT_SYSCALL_DEFINE3(s390_truncate64, const char __user *, path, u32, high, u32, low) +{ + return ksys_truncate(path, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low) +{ + return ksys_ftruncate(fd, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE5(s390_pread64, unsigned int, fd, char __user *, ubuf, + compat_size_t, count, u32, high, u32, low) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + return ksys_pread64(fd, ubuf, count, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubuf, + compat_size_t, count, u32, high, u32, low) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + return ksys_pwrite64(fd, ubuf, count, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) +{ + return ksys_readahead(fd, (unsigned long)high << 32 | low, count); +} + +struct stat64_emu31 { + unsigned long long st_dev; + unsigned int __pad1; +#define STAT64_HAS_BROKEN_ST_INO 1 + u32 __st_ino; + unsigned int st_mode; + unsigned int st_nlink; + u32 st_uid; + u32 st_gid; + unsigned long long st_rdev; + unsigned int __pad3; + long st_size; + u32 st_blksize; + unsigned char __pad4[4]; + u32 __pad5; /* future possible st_blocks high bits */ + u32 st_blocks; /* Number 512-byte blocks allocated. */ + u32 st_atime; + u32 __pad6; + u32 st_mtime; + u32 __pad7; + u32 st_ctime; + u32 __pad8; /* will be high 32 bits of ctime someday */ + unsigned long st_ino; +}; + +static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) +{ + struct stat64_emu31 tmp; + + memset(&tmp, 0, sizeof(tmp)); + + tmp.st_dev = huge_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + tmp.__st_ino = (u32)stat->ino; + tmp.st_mode = stat->mode; + tmp.st_nlink = (unsigned int)stat->nlink; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); + tmp.st_rdev = huge_encode_dev(stat->rdev); + tmp.st_size = stat->size; + tmp.st_blksize = (u32)stat->blksize; + tmp.st_blocks = (u32)stat->blocks; + tmp.st_atime = (u32)stat->atime.tv_sec; + tmp.st_mtime = (u32)stat->mtime.tv_sec; + tmp.st_ctime = (u32)stat->ctime.tv_sec; + + return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(s390_stat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(s390_lstat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(s390_fstat64, unsigned int, fd, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(s390_fstatat64, unsigned int, dfd, const char __user *, filename, + struct stat64_emu31 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct_emu31 { + compat_ulong_t addr; + compat_ulong_t len; + compat_ulong_t prot; + compat_ulong_t flags; + compat_ulong_t fd; + compat_ulong_t offset; +}; + +COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg) +{ + struct mmap_arg_struct_emu31 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} + +COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) +{ + struct mmap_arg_struct_emu31 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); +} + +COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + + return ksys_read(fd, buf, count); +} + +COMPAT_SYSCALL_DEFINE3(s390_write, unsigned int, fd, const char __user *, buf, compat_size_t, count) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + + return ksys_write(fd, buf, count); +} + +/* + * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64. + * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE} + * because the 31 bit values differ from the 64 bit values. + */ + +COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size_t, len, int, advise) +{ + if (advise == 4) + advise = POSIX_FADV_DONTNEED; + else if (advise == 5) + advise = POSIX_FADV_NOREUSE; + return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, + advise); +} + +struct fadvise64_64_args { + int fd; + long long offset; + long long len; + int advice; +}; + +COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) +{ + struct fadvise64_64_args a; + + if ( copy_from_user(&a, args, sizeof(a)) ) + return -EFAULT; + if (a.advice == 4) + a.advice = POSIX_FADV_DONTNEED; + else if (a.advice == 5) + a.advice = POSIX_FADV_NOREUSE; + return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); +} + +COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, + u32, nhigh, u32, nlow, unsigned int, flags) +{ + return ksys_sync_file_range(fd, ((loff_t)offhigh << 32) + offlow, + ((u64)nhigh << 32) + nlow, flags); +} + +COMPAT_SYSCALL_DEFINE6(s390_fallocate, int, fd, int, mode, u32, offhigh, u32, offlow, + u32, lenhigh, u32, lenlow) +{ + return ksys_fallocate(fd, mode, ((loff_t)offhigh << 32) + offlow, + ((u64)lenhigh << 32) + lenlow); +} diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h new file mode 100644 index 000000000..64509e7db --- /dev/null +++ b/arch/s390/kernel/compat_linux.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390X_S390_H +#define _ASM_S390X_S390_H + +#include <linux/compat.h> +#include <linux/socket.h> +#include <linux/syscalls.h> + +/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ +/* to a 64 bit pointer */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) \ + ((unsigned long)(__x)) + +/* Now 32bit compatibility types */ +struct ipc_kludge_32 { + __u32 msgp; /* pointer */ + __s32 msgtyp; +}; + +/* asm/sigcontext.h */ +typedef union +{ + __u64 d; + __u32 f; +} freg_t32; + +typedef struct +{ + unsigned int fpc; + unsigned int pad; + freg_t32 fprs[__NUM_FPRS]; +} _s390_fp_regs32; + +typedef struct +{ + __u32 mask; + __u32 addr; +} _psw_t32 __attribute__ ((aligned(8))); + +typedef struct +{ + _psw_t32 psw; + __u32 gprs[__NUM_GPRS]; + __u32 acrs[__NUM_ACRS]; +} _s390_regs_common32; + +typedef struct +{ + _s390_regs_common32 regs; + _s390_fp_regs32 fpregs; +} _sigregs32; + +typedef struct +{ + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; +} _sigregs_ext32; + +#define _SIGCONTEXT_NSIG32 64 +#define _SIGCONTEXT_NSIG_BPW32 32 +#define __SIGNAL_FRAMESIZE32 96 +#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) + +struct sigcontext32 +{ + __u32 oldmask[_COMPAT_NSIG_WORDS]; + __u32 sregs; /* pointer */ +}; + +/* asm/signal.h */ + +/* asm/ucontext.h */ +struct ucontext32 { + __u32 uc_flags; + __u32 uc_link; /* pointer */ + compat_stack_t uc_stack; + _sigregs32 uc_mcontext; + compat_sigset_t uc_sigmask; + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + unsigned char __unused[128 - sizeof(compat_sigset_t)]; + _sigregs_ext32 uc_mcontext_ext; +}; + +struct stat64_emu31; +struct mmap_arg_struct_emu31; +struct fadvise64_64_args; + +long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); +long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); +long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); +long compat_sys_s390_setregid16(u16 rgid, u16 egid); +long compat_sys_s390_setgid16(u16 gid); +long compat_sys_s390_setreuid16(u16 ruid, u16 euid); +long compat_sys_s390_setuid16(u16 uid); +long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); +long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); +long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); +long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); +long compat_sys_s390_setfsuid16(u16 uid); +long compat_sys_s390_setfsgid16(u16 gid); +long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); +long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); +long compat_sys_s390_getuid16(void); +long compat_sys_s390_geteuid16(void); +long compat_sys_s390_getgid16(void); +long compat_sys_s390_getegid16(void); +long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); +long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); +long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); +long compat_sys_s390_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 high, u32 low); +long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count); +long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); +long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); +long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); +long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); +long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); +long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); +long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); +long compat_sys_s390_fallocate(int fd, int mode, u32 offhigh, u32 offlow, u32 lenhigh, u32 lenlow); +long compat_sys_sigreturn(void); +long compat_sys_rt_sigreturn(void); + +#endif /* _ASM_S390X_S390_H */ diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h new file mode 100644 index 000000000..3c400fc7e --- /dev/null +++ b/arch/s390/kernel/compat_ptrace.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PTRACE32_H +#define _PTRACE32_H + +#include <asm/ptrace.h> /* needed for NUM_CR_WORDS */ +#include "compat_linux.h" /* needed for psw_compat_t */ + +struct compat_per_struct_kernel { + __u32 cr9; /* PER control bits */ + __u32 cr10; /* PER starting address */ + __u32 cr11; /* PER ending address */ + __u32 bits; /* Obsolete software bits */ + __u32 starting_addr; /* User specified start address */ + __u32 ending_addr; /* User specified end address */ + __u16 perc_atmid; /* PER trap ATMID */ + __u32 address; /* PER trap instruction address */ + __u8 access_id; /* PER trap access identification */ +}; + +struct compat_user_regs_struct +{ + psw_compat_t psw; + u32 gprs[NUM_GPRS]; + u32 acrs[NUM_ACRS]; + u32 orig_gpr2; + /* nb: there's a 4-byte hole here */ + s390_fp_regs fp_regs; + /* + * These per registers are in here so that gdb can modify them + * itself as there is no "official" ptrace interface for hardware + * watchpoints. This is the way intel does it. + */ + struct compat_per_struct_kernel per_info; + u32 ieee_instruction_pointer; /* obsolete, always 0 */ +}; + +struct compat_user { + /* We start with the registers, to mimic the way that "memory" + is returned from the ptrace(3,...) function. */ + struct compat_user_regs_struct regs; + /* The rest of this junk is to help gdb figure out what goes where */ + u32 u_tsize; /* Text segment size (pages). */ + u32 u_dsize; /* Data segment size (pages). */ + u32 u_ssize; /* Stack segment size (pages). */ + u32 start_code; /* Starting virtual address of text. */ + u32 start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + s32 signal; /* Signal that caused the core dump. */ + u32 u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + u32 magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ +}; + +typedef struct +{ + __u32 len; + __u32 kernel_addr; + __u32 process_addr; +} compat_ptrace_area; + +#endif /* _PTRACE32_H */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c new file mode 100644 index 000000000..6f2a193cc --- /dev/null +++ b/arch/s390/kernel/compat_signal.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2000, 2006 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * Gerhard Tonn (ton@de.ibm.com) + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + */ + +#include <linux/compat.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/ptrace.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/tty.h> +#include <linux/personality.h> +#include <linux/binfmts.h> +#include <asm/ucontext.h> +#include <linux/uaccess.h> +#include <asm/lowcore.h> +#include <asm/switch_to.h> +#include "compat_linux.h" +#include "compat_ptrace.h" +#include "entry.h" + +typedef struct +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; + struct sigcontext32 sc; + _sigregs32 sregs; + int signo; + _sigregs_ext32 sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ +} sigframe32; + +typedef struct +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; + __u16 svc_insn; + compat_siginfo_t info; + struct ucontext32 uc; +} rt_sigframe32; + +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + save_access_regs(current->thread.acrs); + save_fpu_regs(); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + restore_access_regs(current->thread.acrs); +} + +static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) +{ + _sigregs32 user_sregs; + int i; + + user_sregs.regs.psw.mask = (__u32)(regs->psw.mask >> 32); + user_sregs.regs.psw.mask &= PSW32_MASK_USER | PSW32_MASK_RI; + user_sregs.regs.psw.mask |= PSW32_USER_BITS; + user_sregs.regs.psw.addr = (__u32) regs->psw.addr | + (__u32)(regs->psw.mask & PSW_MASK_BA); + for (i = 0; i < NUM_GPRS; i++) + user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) + return -EFAULT; + return 0; +} + +static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) +{ + _sigregs32 user_sregs; + int i; + + /* Alwys make any pending restarted system call return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) + return -EFAULT; + + if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) + return -EINVAL; + + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ + regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | + (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | + (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_RI) << 32 | + (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_AMODE); + /* Check for invalid user address space control. */ + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) + regs->psw.mask = PSW_ASC_PRIMARY | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); + for (i = 0; i < NUM_GPRS; i++) + regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; + memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, + sizeof(current->thread.acrs)); + fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + + clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + return 0; +} + +static int save_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) +{ + __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Save high gprs to signal stack */ + for (i = 0; i < NUM_GPRS; i++) + gprs_high[i] = regs->gprs[i] >> 32; + if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, + sizeof(sregs_ext->gprs_high))) + return -EFAULT; + + /* Save vector registers to signal stack */ + if (MACHINE_HAS_VX) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } + return 0; +} + +static int restore_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) +{ + __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Restore high gprs from signal stack */ + if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, + sizeof(sregs_ext->gprs_high))) + return -EFAULT; + for (i = 0; i < NUM_GPRS; i++) + *(__u32 *)®s->gprs[i] = gprs_high[i]; + + /* Restore vector registers from signal stack */ + if (MACHINE_HAS_VX) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + } + return 0; +} + +COMPAT_SYSCALL_DEFINE0(sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; + sigset_t set; + + if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) + goto badframe; + set_current_blocked(&set); + save_fpu_regs(); + if (restore_sigregs32(regs, &frame->sregs)) + goto badframe; + if (restore_sigregs_ext32(regs, &frame->sregs_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +COMPAT_SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; + sigset_t set; + + if (get_compat_sigset(&set, &frame->uc.uc_sigmask)) + goto badframe; + set_current_blocked(&set); + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + save_fpu_regs(); + if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) + goto badframe; + if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +/* + * Set up a signal frame. + */ + + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = (unsigned long) A(regs->gprs[15]); + + /* Overflow on alternate signal stack gives SIGSEGV. */ + if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) + return (void __user *) -1UL; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (! sas_ss_flags(sp)) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)((sp - frame_size) & -8ul); +} + +static int setup_frame32(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + int sig = ksig->sig; + sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; + + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); + if (!MACHINE_HAS_VX) + frame_size -= sizeof(frame->sregs_ext.vxrs_low) + + sizeof(frame->sregs_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) + return -EFAULT; + + /* Create struct sigcontext32 on the signal stack */ + if (put_compat_sigset((compat_sigset_t __user *)frame->sc.oldmask, + set, sizeof(compat_sigset_t))) + return -EFAULT; + if (__put_user(ptr_to_compat(&frame->sregs), &frame->sc.sregs)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs32 on the signal stack */ + if (save_sigregs32(regs, &frame->sregs)) + return -EFAULT; + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) + return -EFAULT; + + /* Create _sigregs_ext32 on the signal stack */ + if (save_sigregs_ext32(regs, &frame->sregs_ext)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + } else { + /* Signal frames without vectors registers are short ! */ + __u16 __user *svc = (void __user *) frame + frame_size - 2; + if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) + return -EFAULT; + restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + } + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (__force __u64) frame; + /* Force 31 bit amode and default user address space control. */ + regs->psw.mask = PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__force __u64) ksig->ka.sa.sa_handler; + + regs->gprs[2] = sig; + regs->gprs[3] = (__force __u64) &frame->sc; + + /* We forgot to include these in the sigcontext. + To avoid breaking binary compatibility, they are passed as args. */ + if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP || sig == SIGFPE) { + /* set extra registers only for synchronous signals */ + regs->gprs[4] = regs->int_code & 127; + regs->gprs[5] = regs->int_parm_long; + regs->gprs[6] = current->thread.last_break; + } + + return 0; +} + +static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + rt_sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; + u32 uc_flags; + + frame_size = sizeof(*frame) - + sizeof(frame->uc.uc_mcontext_ext.__reserved); + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + uc_flags = UC_GPRS_HIGH; + if (MACHINE_HAS_VX) { + uc_flags |= UC_VXRS; + } else + frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + } else { + __u16 __user *svc = &frame->svc_insn; + if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) + return -EFAULT; + restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + } + + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(0, &frame->uc.uc_link) || + __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs32(regs, &frame->uc.uc_mcontext) || + put_compat_sigset(&frame->uc.uc_sigmask, set, sizeof(compat_sigset_t)) || + save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (__force __u64) frame; + /* Force 31 bit amode and default user address space control. */ + regs->psw.mask = PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__u64 __force) ksig->ka.sa.sa_handler; + + regs->gprs[2] = ksig->sig; + regs->gprs[3] = (__force __u64) &frame->info; + regs->gprs[4] = (__force __u64) &frame->uc; + regs->gprs[5] = current->thread.last_break; + return 0; +} + +/* + * OK, we're invoking a handler + */ + +void handle_signal32(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + ret = setup_rt_frame32(ksig, oldset, regs); + else + ret = setup_frame32(ksig, oldset, regs); + + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); +} + diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c new file mode 100644 index 000000000..2ce28bf0c --- /dev/null +++ b/arch/s390/kernel/compat_wrapper.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Compat system call wrappers. + * + * Copyright IBM Corp. 2014 + */ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include "entry.h" + +#define COMPAT_SYSCALL_WRAP1(name, ...) \ + COMPAT_SYSCALL_WRAPx(1, _##name, __VA_ARGS__) +#define COMPAT_SYSCALL_WRAP2(name, ...) \ + COMPAT_SYSCALL_WRAPx(2, _##name, __VA_ARGS__) +#define COMPAT_SYSCALL_WRAP3(name, ...) \ + COMPAT_SYSCALL_WRAPx(3, _##name, __VA_ARGS__) +#define COMPAT_SYSCALL_WRAP4(name, ...) \ + COMPAT_SYSCALL_WRAPx(4, _##name, __VA_ARGS__) +#define COMPAT_SYSCALL_WRAP5(name, ...) \ + COMPAT_SYSCALL_WRAPx(5, _##name, __VA_ARGS__) +#define COMPAT_SYSCALL_WRAP6(name, ...) \ + COMPAT_SYSCALL_WRAPx(6, _##name, __VA_ARGS__) + +#define __SC_COMPAT_TYPE(t, a) \ + __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a + +#define __SC_COMPAT_CAST(t, a) \ +({ \ + long __ReS = a; \ + \ + BUILD_BUG_ON((sizeof(t) > 4) && !__TYPE_IS_L(t) && \ + !__TYPE_IS_UL(t) && !__TYPE_IS_PTR(t)); \ + if (__TYPE_IS_L(t)) \ + __ReS = (s32)a; \ + if (__TYPE_IS_UL(t)) \ + __ReS = (u32)a; \ + if (__TYPE_IS_PTR(t)) \ + __ReS = a & 0x7fffffff; \ + (t)__ReS; \ +}) + +/* + * The COMPAT_SYSCALL_WRAP macro generates system call wrappers to be used by + * compat tasks. These wrappers will only be used for system calls where only + * the system call arguments need sign or zero extension or zeroing of the upper + * 33 bits of pointers. + * Note: since the wrapper function will afterwards call a system call which + * again performs zero and sign extension for all system call arguments with + * a size of less than eight bytes, these compat wrappers only touch those + * system call arguments with a size of eight bytes ((unsigned) long and + * pointers). Zero and sign extension for e.g. int parameters will be done by + * the regular system call wrappers. + */ +#define COMPAT_SYSCALL_WRAPx(x, name, ...) \ +asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ +asmlinkage long notrace compat_sys##name(__MAP(x,__SC_COMPAT_TYPE,__VA_ARGS__));\ +asmlinkage long notrace compat_sys##name(__MAP(x,__SC_COMPAT_TYPE,__VA_ARGS__)) \ +{ \ + return sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__)); \ +} + +COMPAT_SYSCALL_WRAP2(creat, const char __user *, pathname, umode_t, mode); +COMPAT_SYSCALL_WRAP2(link, const char __user *, oldname, const char __user *, newname); +COMPAT_SYSCALL_WRAP1(unlink, const char __user *, pathname); +COMPAT_SYSCALL_WRAP1(chdir, const char __user *, filename); +COMPAT_SYSCALL_WRAP3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev); +COMPAT_SYSCALL_WRAP2(chmod, const char __user *, filename, umode_t, mode); +COMPAT_SYSCALL_WRAP1(oldumount, char __user *, name); +COMPAT_SYSCALL_WRAP2(access, const char __user *, filename, int, mode); +COMPAT_SYSCALL_WRAP2(rename, const char __user *, oldname, const char __user *, newname); +COMPAT_SYSCALL_WRAP2(mkdir, const char __user *, pathname, umode_t, mode); +COMPAT_SYSCALL_WRAP1(rmdir, const char __user *, pathname); +COMPAT_SYSCALL_WRAP1(pipe, int __user *, fildes); +COMPAT_SYSCALL_WRAP1(brk, unsigned long, brk); +COMPAT_SYSCALL_WRAP2(signal, int, sig, __sighandler_t, handler); +COMPAT_SYSCALL_WRAP1(acct, const char __user *, name); +COMPAT_SYSCALL_WRAP2(umount, char __user *, name, int, flags); +COMPAT_SYSCALL_WRAP1(chroot, const char __user *, filename); +COMPAT_SYSCALL_WRAP3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask); +COMPAT_SYSCALL_WRAP2(sethostname, char __user *, name, int, len); +COMPAT_SYSCALL_WRAP2(symlink, const char __user *, old, const char __user *, new); +COMPAT_SYSCALL_WRAP3(readlink, const char __user *, path, char __user *, buf, int, bufsiz); +COMPAT_SYSCALL_WRAP1(uselib, const char __user *, library); +COMPAT_SYSCALL_WRAP2(swapon, const char __user *, specialfile, int, swap_flags); +COMPAT_SYSCALL_WRAP4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg); +COMPAT_SYSCALL_WRAP2(munmap, unsigned long, addr, size_t, len); +COMPAT_SYSCALL_WRAP3(syslog, int, type, char __user *, buf, int, len); +COMPAT_SYSCALL_WRAP1(swapoff, const char __user *, specialfile); +COMPAT_SYSCALL_WRAP2(setdomainname, char __user *, name, int, len); +COMPAT_SYSCALL_WRAP1(newuname, struct new_utsname __user *, name); +COMPAT_SYSCALL_WRAP3(mprotect, unsigned long, start, size_t, len, unsigned long, prot); +COMPAT_SYSCALL_WRAP3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs); +COMPAT_SYSCALL_WRAP2(delete_module, const char __user *, name_user, unsigned int, flags); +COMPAT_SYSCALL_WRAP4(quotactl, unsigned int, cmd, const char __user *, special, qid_t, id, void __user *, addr); +COMPAT_SYSCALL_WRAP2(bdflush, int, func, long, data); +COMPAT_SYSCALL_WRAP3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2); +COMPAT_SYSCALL_WRAP5(llseek, unsigned int, fd, unsigned long, high, unsigned long, low, loff_t __user *, result, unsigned int, whence); +COMPAT_SYSCALL_WRAP3(msync, unsigned long, start, size_t, len, int, flags); +COMPAT_SYSCALL_WRAP2(mlock, unsigned long, start, size_t, len); +COMPAT_SYSCALL_WRAP2(munlock, unsigned long, start, size_t, len); +COMPAT_SYSCALL_WRAP2(sched_setparam, pid_t, pid, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP2(sched_getparam, pid_t, pid, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param); +COMPAT_SYSCALL_WRAP5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, unsigned long, new_addr); +COMPAT_SYSCALL_WRAP3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout); +COMPAT_SYSCALL_WRAP5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5); +COMPAT_SYSCALL_WRAP2(getcwd, char __user *, buf, unsigned long, size); +COMPAT_SYSCALL_WRAP2(capget, cap_user_header_t, header, cap_user_data_t, dataptr); +COMPAT_SYSCALL_WRAP2(capset, cap_user_header_t, header, const cap_user_data_t, data); +COMPAT_SYSCALL_WRAP3(lchown, const char __user *, filename, uid_t, user, gid_t, group); +COMPAT_SYSCALL_WRAP2(getgroups, int, gidsetsize, gid_t __user *, grouplist); +COMPAT_SYSCALL_WRAP2(setgroups, int, gidsetsize, gid_t __user *, grouplist); +COMPAT_SYSCALL_WRAP3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid); +COMPAT_SYSCALL_WRAP3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid); +COMPAT_SYSCALL_WRAP3(chown, const char __user *, filename, uid_t, user, gid_t, group); +COMPAT_SYSCALL_WRAP2(pivot_root, const char __user *, new_root, const char __user *, put_old); +COMPAT_SYSCALL_WRAP3(mincore, unsigned long, start, size_t, len, unsigned char __user *, vec); +COMPAT_SYSCALL_WRAP3(madvise, unsigned long, start, size_t, len, int, behavior); +COMPAT_SYSCALL_WRAP5(setxattr, const char __user *, path, const char __user *, name, const void __user *, value, size_t, size, int, flags); +COMPAT_SYSCALL_WRAP5(lsetxattr, const char __user *, path, const char __user *, name, const void __user *, value, size_t, size, int, flags); +COMPAT_SYSCALL_WRAP5(fsetxattr, int, fd, const char __user *, name, const void __user *, value, size_t, size, int, flags); +COMPAT_SYSCALL_WRAP3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count); +COMPAT_SYSCALL_WRAP4(getxattr, const char __user *, path, const char __user *, name, void __user *, value, size_t, size); +COMPAT_SYSCALL_WRAP4(lgetxattr, const char __user *, path, const char __user *, name, void __user *, value, size_t, size); +COMPAT_SYSCALL_WRAP4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size); +COMPAT_SYSCALL_WRAP3(listxattr, const char __user *, path, char __user *, list, size_t, size); +COMPAT_SYSCALL_WRAP3(llistxattr, const char __user *, path, char __user *, list, size_t, size); +COMPAT_SYSCALL_WRAP3(flistxattr, int, fd, char __user *, list, size_t, size); +COMPAT_SYSCALL_WRAP2(removexattr, const char __user *, path, const char __user *, name); +COMPAT_SYSCALL_WRAP2(lremovexattr, const char __user *, path, const char __user *, name); +COMPAT_SYSCALL_WRAP2(fremovexattr, int, fd, const char __user *, name); +COMPAT_SYSCALL_WRAP1(set_tid_address, int __user *, tidptr); +COMPAT_SYSCALL_WRAP4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event); +COMPAT_SYSCALL_WRAP4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout); +COMPAT_SYSCALL_WRAP1(io_destroy, aio_context_t, ctx); +COMPAT_SYSCALL_WRAP3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result); +COMPAT_SYSCALL_WRAP1(mq_unlink, const char __user *, name); +COMPAT_SYSCALL_WRAP5(add_key, const char __user *, tp, const char __user *, dsc, const void __user *, pld, size_t, len, key_serial_t, id); +COMPAT_SYSCALL_WRAP4(request_key, const char __user *, tp, const char __user *, dsc, const char __user *, info, key_serial_t, id); +COMPAT_SYSCALL_WRAP5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags); +COMPAT_SYSCALL_WRAP3(inotify_add_watch, int, fd, const char __user *, path, u32, mask); +COMPAT_SYSCALL_WRAP3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode); +COMPAT_SYSCALL_WRAP4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned, dev); +COMPAT_SYSCALL_WRAP5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag); +COMPAT_SYSCALL_WRAP3(unlinkat, int, dfd, const char __user *, pathname, int, flag); +COMPAT_SYSCALL_WRAP4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname); +COMPAT_SYSCALL_WRAP5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags); +COMPAT_SYSCALL_WRAP3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname); +COMPAT_SYSCALL_WRAP4(readlinkat, int, dfd, const char __user *, path, char __user *, buf, int, bufsiz); +COMPAT_SYSCALL_WRAP3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode); +COMPAT_SYSCALL_WRAP3(faccessat, int, dfd, const char __user *, filename, int, mode); +COMPAT_SYSCALL_WRAP1(unshare, unsigned long, unshare_flags); +COMPAT_SYSCALL_WRAP6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP3(getcpu, unsigned __user *, cpu, unsigned __user *, node, struct getcpu_cache __user *, cache); +COMPAT_SYSCALL_WRAP2(pipe2, int __user *, fildes, int, flags); +COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags); +COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls); +COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim); +COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag); +COMPAT_SYSCALL_WRAP5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2); +COMPAT_SYSCALL_WRAP3(finit_module, int, fd, const char __user *, uargs, int, flags); +COMPAT_SYSCALL_WRAP3(sched_setattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, flags); +COMPAT_SYSCALL_WRAP4(sched_getattr, pid_t, pid, struct sched_attr __user *, attr, unsigned int, size, unsigned int, flags); +COMPAT_SYSCALL_WRAP5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags); +COMPAT_SYSCALL_WRAP3(seccomp, unsigned int, op, unsigned int, flags, const char __user *, uargs) +COMPAT_SYSCALL_WRAP3(getrandom, char __user *, buf, size_t, count, unsigned int, flags) +COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, flags) +COMPAT_SYSCALL_WRAP3(bpf, int, cmd, union bpf_attr *, attr, unsigned int, size); +COMPAT_SYSCALL_WRAP3(s390_pci_mmio_write, const unsigned long, mmio_addr, const void __user *, user_buffer, const size_t, length); +COMPAT_SYSCALL_WRAP3(s390_pci_mmio_read, const unsigned long, mmio_addr, void __user *, user_buffer, const size_t, length); +COMPAT_SYSCALL_WRAP4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec); +COMPAT_SYSCALL_WRAP3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen); +COMPAT_SYSCALL_WRAP3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen); +COMPAT_SYSCALL_WRAP4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags); +COMPAT_SYSCALL_WRAP3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len); +COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len); +COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len); +COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags); +COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb); +COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer); +COMPAT_SYSCALL_WRAP4(s390_sthyi, unsigned long, code, void __user *, info, u64 __user *, rc, unsigned long, flags); +COMPAT_SYSCALL_WRAP5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) +COMPAT_SYSCALL_WRAP4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c new file mode 100644 index 000000000..2da027359 --- /dev/null +++ b/arch/s390/kernel/cpcmd.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2007 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Christian Borntraeger (cborntra@de.ibm.com), + */ + +#define KMSG_COMPONENT "cpcmd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/cpcmd.h> +#include <asm/io.h> + +static DEFINE_SPINLOCK(cpcmd_lock); +static char cpcmd_buf[241]; + +static int diag8_noresponse(int cmdlen) +{ + register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; + register unsigned long reg3 asm ("3") = cmdlen; + + asm volatile( + " diag %1,%0,0x8\n" + : "+d" (reg3) : "d" (reg2) : "cc"); + return reg3; +} + +static int diag8_response(int cmdlen, char *response, int *rlen) +{ + unsigned long _cmdlen = cmdlen | 0x40000000L; + unsigned long _rlen = *rlen; + register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf; + register unsigned long reg3 asm ("3") = (addr_t) response; + register unsigned long reg4 asm ("4") = _cmdlen; + register unsigned long reg5 asm ("5") = _rlen; + + asm volatile( + " diag %2,%0,0x8\n" + " brc 8,1f\n" + " agr %1,%4\n" + "1:\n" + : "+d" (reg4), "+d" (reg5) + : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc"); + *rlen = reg5; + return reg4; +} + +/* + * __cpcmd has some restrictions over cpcmd + * - __cpcmd is unlocked and therefore not SMP-safe + */ +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) +{ + int cmdlen; + int rc; + int response_len; + + cmdlen = strlen(cmd); + BUG_ON(cmdlen > 240); + memcpy(cpcmd_buf, cmd, cmdlen); + ASCEBC(cpcmd_buf, cmdlen); + + diag_stat_inc(DIAG_STAT_X008); + if (response) { + memset(response, 0, rlen); + response_len = rlen; + rc = diag8_response(cmdlen, response, &rlen); + EBCASC(response, response_len); + } else { + rc = diag8_noresponse(cmdlen); + } + if (response_code) + *response_code = rc; + return rlen; +} +EXPORT_SYMBOL(__cpcmd); + +int cpcmd(const char *cmd, char *response, int rlen, int *response_code) +{ + unsigned long flags; + char *lowbuf; + int len; + + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); + if (!lowbuf) { + pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); + return -ENOMEM; + } + spin_lock_irqsave(&cpcmd_lock, flags); + len = __cpcmd(cmd, lowbuf, rlen, response_code); + spin_unlock_irqrestore(&cpcmd_lock, flags); + memcpy(response, lowbuf, rlen); + kfree(lowbuf); + } else { + spin_lock_irqsave(&cpcmd_lock, flags); + len = __cpcmd(cmd, response, rlen, response_code); + spin_unlock_irqrestore(&cpcmd_lock, flags); + } + return len; +} +EXPORT_SYMBOL(cpcmd); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c new file mode 100644 index 000000000..376f6b6df --- /dev/null +++ b/arch/s390/kernel/crash_dump.c @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 kdump implementation + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> + */ + +#include <linux/crash_dump.h> +#include <asm/lowcore.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/elf.h> +#include <asm/asm-offsets.h> +#include <linux/memblock.h> +#include <asm/os_info.h> +#include <asm/elf.h> +#include <asm/ipl.h> +#include <asm/sclp.h> + +#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) +#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) +#define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y)))) + +static struct memblock_region oldmem_region; + +static struct memblock_type oldmem_type = { + .cnt = 1, + .max = 1, + .total_size = 0, + .regions = &oldmem_region, + .name = "oldmem", +}; + +struct save_area { + struct list_head list; + u64 psw[2]; + u64 ctrs[16]; + u64 gprs[16]; + u32 acrs[16]; + u64 fprs[16]; + u32 fpc; + u32 prefix; + u64 todpreg; + u64 timer; + u64 todcmp; + u64 vxrs_low[16]; + __vector128 vxrs_high[16]; +}; + +static LIST_HEAD(dump_save_areas); + +/* + * Allocate a save area + */ +struct save_area * __init save_area_alloc(bool is_boot_cpu) +{ + struct save_area *sa; + + sa = (void *) memblock_alloc(sizeof(*sa), 8); + if (is_boot_cpu) + list_add(&sa->list, &dump_save_areas); + else + list_add_tail(&sa->list, &dump_save_areas); + return sa; +} + +/* + * Return the address of the save area for the boot CPU + */ +struct save_area * __init save_area_boot_cpu(void) +{ + return list_first_entry_or_null(&dump_save_areas, struct save_area, list); +} + +/* + * Copy CPU registers into the save area + */ +void __init save_area_add_regs(struct save_area *sa, void *regs) +{ + struct lowcore *lc; + + lc = (struct lowcore *)(regs - __LC_FPREGS_SAVE_AREA); + memcpy(&sa->psw, &lc->psw_save_area, sizeof(sa->psw)); + memcpy(&sa->ctrs, &lc->cregs_save_area, sizeof(sa->ctrs)); + memcpy(&sa->gprs, &lc->gpregs_save_area, sizeof(sa->gprs)); + memcpy(&sa->acrs, &lc->access_regs_save_area, sizeof(sa->acrs)); + memcpy(&sa->fprs, &lc->floating_pt_save_area, sizeof(sa->fprs)); + memcpy(&sa->fpc, &lc->fpt_creg_save_area, sizeof(sa->fpc)); + memcpy(&sa->prefix, &lc->prefixreg_save_area, sizeof(sa->prefix)); + memcpy(&sa->todpreg, &lc->tod_progreg_save_area, sizeof(sa->todpreg)); + memcpy(&sa->timer, &lc->cpu_timer_save_area, sizeof(sa->timer)); + memcpy(&sa->todcmp, &lc->clock_comp_save_area, sizeof(sa->todcmp)); +} + +/* + * Copy vector registers into the save area + */ +void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) +{ + int i; + + /* Copy lower halves of vector registers 0-15 */ + for (i = 0; i < 16; i++) + memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + /* Copy vector registers 16-31 */ + memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); +} + +/* + * Return physical address for virtual address + */ +static inline void *load_real_addr(void *addr) +{ + unsigned long real_addr; + + asm volatile( + " lra %0,0(%1)\n" + " jz 0f\n" + " la %0,0\n" + "0:" + : "=a" (real_addr) : "a" (addr) : "cc"); + return (void *)real_addr; +} + +/* + * Copy memory of the old, dumped system to a kernel space virtual address + */ +int copy_oldmem_kernel(void *dst, void *src, size_t count) +{ + unsigned long from, len; + void *ra; + int rc; + + while (count) { + from = __pa(src); + if (!OLDMEM_BASE && from < sclp.hsa_size) { + /* Copy from zfcpdump HSA area */ + len = min(count, sclp.hsa_size - from); + rc = memcpy_hsa_kernel(dst, from, len); + if (rc) + return rc; + } else { + /* Check for swapped kdump oldmem areas */ + if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { + from -= OLDMEM_BASE; + len = min(count, OLDMEM_SIZE - from); + } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { + len = min(count, OLDMEM_SIZE - from); + from += OLDMEM_BASE; + } else { + len = count; + } + if (is_vmalloc_or_module_addr(dst)) { + ra = load_real_addr(dst); + len = min(PAGE_SIZE - offset_in_page(ra), len); + } else { + ra = dst; + } + if (memcpy_real(ra, (void *) from, len)) + return -EFAULT; + } + dst += len; + src += len; + count -= len; + } + return 0; +} + +/* + * Copy memory of the old, dumped system to a user space virtual address + */ +static int copy_oldmem_user(void __user *dst, void *src, size_t count) +{ + unsigned long from, len; + int rc; + + while (count) { + from = __pa(src); + if (!OLDMEM_BASE && from < sclp.hsa_size) { + /* Copy from zfcpdump HSA area */ + len = min(count, sclp.hsa_size - from); + rc = memcpy_hsa_user(dst, from, len); + if (rc) + return rc; + } else { + /* Check for swapped kdump oldmem areas */ + if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { + from -= OLDMEM_BASE; + len = min(count, OLDMEM_SIZE - from); + } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { + len = min(count, OLDMEM_SIZE - from); + from += OLDMEM_BASE; + } else { + len = count; + } + rc = copy_to_user_real(dst, (void *) from, count); + if (rc) + return rc; + } + dst += len; + src += len; + count -= len; + } + return 0; +} + +/* + * Copy one page from "oldmem" + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + void *src; + int rc; + + if (!csize) + return 0; + src = (void *) (pfn << PAGE_SHIFT) + offset; + if (userbuf) + rc = copy_oldmem_user((void __force __user *) buf, src, csize); + else + rc = copy_oldmem_kernel((void *) buf, src, csize); + return rc; +} + +/* + * Remap "oldmem" for kdump + * + * For the kdump reserved memory this functions performs a swap operation: + * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_old; + int rc; + + if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { + size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + rc = remap_pfn_range(vma, from, + pfn + (OLDMEM_BASE >> PAGE_SHIFT), + size_old, prot); + if (rc || size == size_old) + return rc; + size -= size_old; + from += size_old; + pfn += size_old >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for zfcpdump + * + * We only map available memory above HSA size. Memory below HSA size + * is read on demand using the copy_oldmem_page() function. + */ +static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long hsa_end = sclp.hsa_size; + unsigned long size_hsa; + + if (pfn < hsa_end >> PAGE_SHIFT) { + size_hsa = min(size, hsa_end - (pfn << PAGE_SHIFT)); + if (size == size_hsa) + return 0; + size -= size_hsa; + from += size_hsa; + pfn += size_hsa >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for kdump or zfcpdump + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (OLDMEM_BASE) + return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, + prot); +} + +static const char *nt_name(Elf64_Word type) +{ + const char *name = "LINUX"; + + if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) + name = KEXEC_CORE_NOTE_NAME; + return name; +} + +/* + * Initialize ELF note + */ +static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, + const char *name) +{ + Elf64_Nhdr *note; + u64 len; + + note = (Elf64_Nhdr *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = d_len; + note->n_type = type; + len = sizeof(Elf64_Nhdr); + + memcpy(buf + len, name, note->n_namesz); + len = roundup(len + note->n_namesz, 4); + + memcpy(buf + len, desc, note->n_descsz); + len = roundup(len + note->n_descsz, 4); + + return PTR_ADD(buf, len); +} + +static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) +{ + return nt_init_name(buf, type, desc, d_len, nt_name(type)); +} + +/* + * Calculate the size of ELF note + */ +static size_t nt_size_name(int d_len, const char *name) +{ + size_t size; + + size = sizeof(Elf64_Nhdr); + size += roundup(strlen(name) + 1, 4); + size += roundup(d_len, 4); + + return size; +} + +static inline size_t nt_size(Elf64_Word type, int d_len) +{ + return nt_size_name(d_len, nt_name(type)); +} + +/* + * Fill ELF notes for one CPU with save area registers + */ +static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) +{ + struct elf_prstatus nt_prstatus; + elf_fpregset_t nt_fpregset; + + /* Prepare prstatus note */ + memset(&nt_prstatus, 0, sizeof(nt_prstatus)); + memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); + memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); + memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); + nt_prstatus.pr_pid = cpu; + /* Prepare fpregset (floating point) note */ + memset(&nt_fpregset, 0, sizeof(nt_fpregset)); + memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); + memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); + /* Create ELF notes for the CPU */ + ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); + ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); + ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); + ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); + ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); + ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); + ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); + if (MACHINE_HAS_VX) { + ptr = nt_init(ptr, NT_S390_VXRS_HIGH, + &sa->vxrs_high, sizeof(sa->vxrs_high)); + ptr = nt_init(ptr, NT_S390_VXRS_LOW, + &sa->vxrs_low, sizeof(sa->vxrs_low)); + } + return ptr; +} + +/* + * Calculate size of ELF notes per cpu + */ +static size_t get_cpu_elf_notes_size(void) +{ + struct save_area *sa = NULL; + size_t size; + + size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); + size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); + size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); + size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); + size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); + size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); + size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); + if (MACHINE_HAS_VX) { + size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); + size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + } + + return size; +} + +/* + * Initialize prpsinfo note (new kernel) + */ +static void *nt_prpsinfo(void *ptr) +{ + struct elf_prpsinfo prpsinfo; + + memset(&prpsinfo, 0, sizeof(prpsinfo)); + prpsinfo.pr_sname = 'R'; + strcpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); +} + +/* + * Get vmcoreinfo using lowcore->vmcore_info (new kernel) + */ +static void *get_vmcoreinfo_old(unsigned long *size) +{ + char nt_name[11], *vmcoreinfo; + Elf64_Nhdr note; + void *addr; + + if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + return NULL; + memset(nt_name, 0, sizeof(nt_name)); + if (copy_oldmem_kernel(¬e, addr, sizeof(note))) + return NULL; + if (copy_oldmem_kernel(nt_name, addr + sizeof(note), + sizeof(nt_name) - 1)) + return NULL; + if (strcmp(nt_name, VMCOREINFO_NOTE_NAME) != 0) + return NULL; + vmcoreinfo = kzalloc(note.n_descsz, GFP_KERNEL); + if (!vmcoreinfo) + return NULL; + if (copy_oldmem_kernel(vmcoreinfo, addr + 24, note.n_descsz)) { + kfree(vmcoreinfo); + return NULL; + } + *size = note.n_descsz; + return vmcoreinfo; +} + +/* + * Initialize vmcoreinfo note (new kernel) + */ +static void *nt_vmcoreinfo(void *ptr) +{ + const char *name = VMCOREINFO_NOTE_NAME; + unsigned long size; + void *vmcoreinfo; + + vmcoreinfo = os_info_old_entry(OS_INFO_VMCOREINFO, &size); + if (vmcoreinfo) + return nt_init_name(ptr, 0, vmcoreinfo, size, name); + + vmcoreinfo = get_vmcoreinfo_old(&size); + if (!vmcoreinfo) + return ptr; + ptr = nt_init_name(ptr, 0, vmcoreinfo, size, name); + kfree(vmcoreinfo); + return ptr; +} + +static size_t nt_vmcoreinfo_size(void) +{ + const char *name = VMCOREINFO_NOTE_NAME; + unsigned long size; + void *vmcoreinfo; + + vmcoreinfo = os_info_old_entry(OS_INFO_VMCOREINFO, &size); + if (vmcoreinfo) + return nt_size_name(size, name); + + vmcoreinfo = get_vmcoreinfo_old(&size); + if (!vmcoreinfo) + return 0; + + kfree(vmcoreinfo); + return nt_size_name(size, name); +} + +/* + * Initialize final note (needed for /proc/vmcore code) + */ +static void *nt_final(void *ptr) +{ + Elf64_Nhdr *note; + + note = (Elf64_Nhdr *) ptr; + note->n_namesz = 0; + note->n_descsz = 0; + note->n_type = 0; + return PTR_ADD(ptr, sizeof(Elf64_Nhdr)); +} + +/* + * Initialize ELF header (new kernel) + */ +static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +{ + memset(ehdr, 0, sizeof(*ehdr)); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2MSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = EM_S390; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + ehdr->e_phnum = mem_chunk_cnt + 1; + return ehdr + 1; +} + +/* + * Return CPU count for ELF header (new kernel) + */ +static int get_cpu_cnt(void) +{ + struct save_area *sa; + int cpus = 0; + + list_for_each_entry(sa, &dump_save_areas, list) + if (sa->prefix != 0) + cpus++; + return cpus; +} + +/* + * Return memory chunk count for ELF header (new kernel) + */ +static int get_mem_chunk_cnt(void) +{ + int cnt = 0; + u64 idx; + + for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, + MEMBLOCK_NONE, NULL, NULL, NULL) + cnt++; + return cnt; +} + +/* + * Initialize ELF loads (new kernel) + */ +static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +{ + phys_addr_t start, end; + u64 idx; + + for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, + MEMBLOCK_NONE, &start, &end, NULL) { + phdr->p_filesz = end - start; + phdr->p_type = PT_LOAD; + phdr->p_offset = start; + phdr->p_vaddr = start; + phdr->p_paddr = start; + phdr->p_memsz = end - start; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; + phdr++; + } +} + +/* + * Initialize notes (new kernel) + */ +static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) +{ + struct save_area *sa; + void *ptr_start = ptr; + int cpu; + + ptr = nt_prpsinfo(ptr); + + cpu = 1; + list_for_each_entry(sa, &dump_save_areas, list) + if (sa->prefix != 0) + ptr = fill_cpu_elf_notes(ptr, cpu++, sa); + ptr = nt_vmcoreinfo(ptr); + ptr = nt_final(ptr); + memset(phdr, 0, sizeof(*phdr)); + phdr->p_type = PT_NOTE; + phdr->p_offset = notes_offset; + phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); + phdr->p_memsz = phdr->p_filesz; + return ptr; +} + +static size_t get_elfcorehdr_size(int mem_chunk_cnt) +{ + size_t size; + + size = sizeof(Elf64_Ehdr); + /* PT_NOTES */ + size += sizeof(Elf64_Phdr); + /* nt_prpsinfo */ + size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + /* regsets */ + size += get_cpu_cnt() * get_cpu_elf_notes_size(); + /* nt_vmcoreinfo */ + size += nt_vmcoreinfo_size(); + /* nt_final */ + size += sizeof(Elf64_Nhdr); + /* PT_LOADS */ + size += mem_chunk_cnt * sizeof(Elf64_Phdr); + + return size; +} + +/* + * Create ELF core header (new kernel) + */ +int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + Elf64_Phdr *phdr_notes, *phdr_loads; + int mem_chunk_cnt; + void *ptr, *hdr; + u32 alloc_size; + u64 hdr_off; + + /* If we are not in kdump or zfcpdump mode return */ + if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) + return 0; + /* If we cannot get HSA size for zfcpdump return error */ + if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size) + return -ENODEV; + + /* For kdump, exclude previous crashkernel memory */ + if (OLDMEM_BASE) { + oldmem_region.base = OLDMEM_BASE; + oldmem_region.size = OLDMEM_SIZE; + oldmem_type.total_size = OLDMEM_SIZE; + } + + mem_chunk_cnt = get_mem_chunk_cnt(); + + alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + + hdr = kzalloc(alloc_size, GFP_KERNEL); + + /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + * a dump with this crash kernel will fail. Panic now to allow other + * dump mechanisms to take over. + */ + if (!hdr) + panic("s390 kdump allocating elfcorehdr failed"); + + /* Init elf header */ + ptr = ehdr_init(hdr, mem_chunk_cnt); + /* Init program headers */ + phdr_notes = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); + phdr_loads = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + /* Init notes */ + hdr_off = PTR_DIFF(ptr, hdr); + ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init loads */ + hdr_off = PTR_DIFF(ptr, hdr); + loads_init(phdr_loads, hdr_off); + *addr = (unsigned long long) hdr; + *size = (unsigned long long) hdr_off; + BUG_ON(elfcorehdr_size > alloc_size); + return 0; +} + +/* + * Free ELF core header (new kernel) + */ +void elfcorehdr_free(unsigned long long addr) +{ + kfree((void *)(unsigned long)addr); +} + +/* + * Read from ELF header + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + memcpy(buf, src, count); + *ppos += count; + return count; +} + +/* + * Read from ELF notes data + */ +ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + memcpy(buf, src, count); + *ppos += count; + return count; +} diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c new file mode 100644 index 000000000..04bbf7e97 --- /dev/null +++ b/arch/s390/kernel/debug.c @@ -0,0 +1,1461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S/390 debug facility + * + * Copyright IBM Corp. 1999, 2012 + * + * Author(s): Michael Holzheu (holzheu@de.ibm.com), + * Holger Smolinski (Holger.Smolinski@de.ibm.com) + * + * Bugreports to: <Linux390@de.ibm.com> + */ + +#define KMSG_COMPONENT "s390dbf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/stddef.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/sysctl.h> +#include <linux/uaccess.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/debugfs.h> + +#include <asm/debug.h> + +#define DEBUG_PROLOG_ENTRY -1 + +#define ALL_AREAS 0 /* copy all debug areas */ +#define NO_AREAS 1 /* copy no debug areas */ + +/* typedefs */ + +typedef struct file_private_info { + loff_t offset; /* offset of last read in file */ + int act_area; /* number of last formated area */ + int act_page; /* act page in given area */ + int act_entry; /* last formated entry (offset */ + /* relative to beginning of last */ + /* formated page) */ + size_t act_entry_offset; /* up to this offset we copied */ + /* in last read the last formated */ + /* entry to userland */ + char temp_buf[2048]; /* buffer for output */ + debug_info_t *debug_info_org; /* original debug information */ + debug_info_t *debug_info_snap; /* snapshot of debug information */ + struct debug_view *view; /* used view of debug info */ +} file_private_info_t; + +typedef struct { + char *string; + /* + * This assumes that all args are converted into longs + * on L/390 this is the case for all types of parameter + * except of floats, and long long (32 bit) + * + */ + long args[0]; +} debug_sprintf_entry_t; + +/* internal function prototyes */ + +static int debug_init(void); +static ssize_t debug_output(struct file *file, char __user *user_buf, + size_t user_len, loff_t *offset); +static ssize_t debug_input(struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset); +static int debug_open(struct inode *inode, struct file *file); +static int debug_close(struct inode *inode, struct file *file); +static debug_info_t *debug_info_create(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode); +static void debug_info_get(debug_info_t *); +static void debug_info_put(debug_info_t *); +static int debug_prolog_level_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_prolog_pages_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf); +static int debug_raw_format_fn(debug_info_t *id, + struct debug_view *view, char *out_buf, + const char *in_buf); +static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf); + +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, debug_sprintf_entry_t *curr_event); + +/* globals */ + +struct debug_view debug_raw_view = { + "raw", + NULL, + &debug_raw_header_fn, + &debug_raw_format_fn, + NULL, + NULL +}; +EXPORT_SYMBOL(debug_raw_view); + +struct debug_view debug_hex_ascii_view = { + "hex_ascii", + NULL, + &debug_dflt_header_fn, + &debug_hex_ascii_format_fn, + NULL, + NULL +}; +EXPORT_SYMBOL(debug_hex_ascii_view); + +static struct debug_view debug_level_view = { + "level", + &debug_prolog_level_fn, + NULL, + NULL, + &debug_input_level_fn, + NULL +}; + +static struct debug_view debug_pages_view = { + "pages", + &debug_prolog_pages_fn, + NULL, + NULL, + &debug_input_pages_fn, + NULL +}; + +static struct debug_view debug_flush_view = { + "flush", + NULL, + NULL, + NULL, + &debug_input_flush_fn, + NULL +}; + +struct debug_view debug_sprintf_view = { + "sprintf", + NULL, + &debug_dflt_header_fn, + (debug_format_proc_t *)&debug_sprintf_format_fn, + NULL, + NULL +}; +EXPORT_SYMBOL(debug_sprintf_view); + +/* used by dump analysis tools to determine version of debug feature */ +static unsigned int __used debug_feature_version = __DEBUG_FEATURE_VERSION; + +/* static globals */ + +static debug_info_t *debug_area_first; +static debug_info_t *debug_area_last; +static DEFINE_MUTEX(debug_mutex); + +static int initialized; +static int debug_critical; + +static const struct file_operations debug_file_ops = { + .owner = THIS_MODULE, + .read = debug_output, + .write = debug_input, + .open = debug_open, + .release = debug_close, + .llseek = no_llseek, +}; + +static struct dentry *debug_debugfs_root_entry; + +/* functions */ + +/* + * debug_areas_alloc + * - Debug areas are implemented as a threedimensonal array: + * areas[areanumber][pagenumber][pageoffset] + */ + +static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) +{ + debug_entry_t ***areas; + int i, j; + + areas = kmalloc_array(nr_areas, sizeof(debug_entry_t **), GFP_KERNEL); + if (!areas) + goto fail_malloc_areas; + for (i = 0; i < nr_areas; i++) { + /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ + areas[i] = kmalloc_array(pages_per_area, + sizeof(debug_entry_t *), + GFP_KERNEL | __GFP_NOWARN); + if (!areas[i]) + goto fail_malloc_areas2; + for (j = 0; j < pages_per_area; j++) { + areas[i][j] = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!areas[i][j]) { + for (j--; j >= 0 ; j--) + kfree(areas[i][j]); + kfree(areas[i]); + goto fail_malloc_areas2; + } + } + } + return areas; + +fail_malloc_areas2: + for (i--; i >= 0; i--) { + for (j = 0; j < pages_per_area; j++) + kfree(areas[i][j]); + kfree(areas[i]); + } + kfree(areas); +fail_malloc_areas: + return NULL; +} + +/* + * debug_info_alloc + * - alloc new debug-info + */ +static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, + int nr_areas, int buf_size, int level, + int mode) +{ + debug_info_t *rc; + + /* alloc everything */ + rc = kmalloc(sizeof(debug_info_t), GFP_KERNEL); + if (!rc) + goto fail_malloc_rc; + rc->active_entries = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + if (!rc->active_entries) + goto fail_malloc_active_entries; + rc->active_pages = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + if (!rc->active_pages) + goto fail_malloc_active_pages; + if ((mode == ALL_AREAS) && (pages_per_area != 0)) { + rc->areas = debug_areas_alloc(pages_per_area, nr_areas); + if (!rc->areas) + goto fail_malloc_areas; + } else { + rc->areas = NULL; + } + + /* initialize members */ + spin_lock_init(&rc->lock); + rc->pages_per_area = pages_per_area; + rc->nr_areas = nr_areas; + rc->active_area = 0; + rc->level = level; + rc->buf_size = buf_size; + rc->entry_size = sizeof(debug_entry_t) + buf_size; + strlcpy(rc->name, name, sizeof(rc->name)); + memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); + memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); + refcount_set(&(rc->ref_count), 0); + + return rc; + +fail_malloc_areas: + kfree(rc->active_pages); +fail_malloc_active_pages: + kfree(rc->active_entries); +fail_malloc_active_entries: + kfree(rc); +fail_malloc_rc: + return NULL; +} + +/* + * debug_areas_free + * - free all debug areas + */ +static void debug_areas_free(debug_info_t *db_info) +{ + int i, j; + + if (!db_info->areas) + return; + for (i = 0; i < db_info->nr_areas; i++) { + for (j = 0; j < db_info->pages_per_area; j++) + kfree(db_info->areas[i][j]); + kfree(db_info->areas[i]); + } + kfree(db_info->areas); + db_info->areas = NULL; +} + +/* + * debug_info_free + * - free memory debug-info + */ +static void debug_info_free(debug_info_t *db_info) +{ + debug_areas_free(db_info); + kfree(db_info->active_entries); + kfree(db_info->active_pages); + kfree(db_info); +} + +/* + * debug_info_create + * - create new debug-info + */ + +static debug_info_t *debug_info_create(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode) +{ + debug_info_t *rc; + + rc = debug_info_alloc(name, pages_per_area, nr_areas, buf_size, + DEBUG_DEFAULT_LEVEL, ALL_AREAS); + if (!rc) + goto out; + + rc->mode = mode & ~S_IFMT; + + /* create root directory */ + rc->debugfs_root_entry = debugfs_create_dir(rc->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = rc; + rc->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = rc; + rc->prev = debug_area_last; + } + debug_area_last = rc; + rc->next = NULL; + + refcount_set(&rc->ref_count, 1); +out: + return rc; +} + +/* + * debug_info_copy + * - copy debug-info + */ +static debug_info_t *debug_info_copy(debug_info_t *in, int mode) +{ + unsigned long flags; + debug_info_t *rc; + int i, j; + + /* get a consistent copy of the debug areas */ + do { + rc = debug_info_alloc(in->name, in->pages_per_area, + in->nr_areas, in->buf_size, in->level, mode); + spin_lock_irqsave(&in->lock, flags); + if (!rc) + goto out; + /* has something changed in the meantime ? */ + if ((rc->pages_per_area == in->pages_per_area) && + (rc->nr_areas == in->nr_areas)) { + break; + } + spin_unlock_irqrestore(&in->lock, flags); + debug_info_free(rc); + } while (1); + + if (mode == NO_AREAS) + goto out; + + for (i = 0; i < in->nr_areas; i++) { + for (j = 0; j < in->pages_per_area; j++) + memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE); + } +out: + spin_unlock_irqrestore(&in->lock, flags); + return rc; +} + +/* + * debug_info_get + * - increments reference count for debug-info + */ +static void debug_info_get(debug_info_t *db_info) +{ + if (db_info) + refcount_inc(&db_info->ref_count); +} + +/* + * debug_info_put: + * - decreases reference count for debug-info and frees it if necessary + */ +static void debug_info_put(debug_info_t *db_info) +{ + int i; + + if (!db_info) + return; + if (refcount_dec_and_test(&db_info->ref_count)) { + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!db_info->views[i]) + continue; + debugfs_remove(db_info->debugfs_entries[i]); + } + debugfs_remove(db_info->debugfs_root_entry); + if (db_info == debug_area_first) + debug_area_first = db_info->next; + if (db_info == debug_area_last) + debug_area_last = db_info->prev; + if (db_info->prev) + db_info->prev->next = db_info->next; + if (db_info->next) + db_info->next->prev = db_info->prev; + debug_info_free(db_info); + } +} + +/* + * debug_format_entry: + * - format one debug entry and return size of formated data + */ +static int debug_format_entry(file_private_info_t *p_info) +{ + debug_info_t *id_snap = p_info->debug_info_snap; + struct debug_view *view = p_info->view; + debug_entry_t *act_entry; + size_t len = 0; + + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { + /* print prolog */ + if (view->prolog_proc) + len += view->prolog_proc(id_snap, view, p_info->temp_buf); + goto out; + } + if (!id_snap->areas) /* this is true, if we have a prolog only view */ + goto out; /* or if 'pages_per_area' is 0 */ + act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area] + [p_info->act_page] + p_info->act_entry); + + if (act_entry->id.stck == 0LL) + goto out; /* empty entry */ + if (view->header_proc) + len += view->header_proc(id_snap, view, p_info->act_area, + act_entry, p_info->temp_buf + len); + if (view->format_proc) + len += view->format_proc(id_snap, view, p_info->temp_buf + len, + DEBUG_DATA(act_entry)); +out: + return len; +} + +/* + * debug_next_entry: + * - goto next entry in p_info + */ +static inline int debug_next_entry(file_private_info_t *p_info) +{ + debug_info_t *id; + + id = p_info->debug_info_snap; + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { + p_info->act_entry = 0; + p_info->act_page = 0; + goto out; + } + if (!id->areas) + return 1; + p_info->act_entry += id->entry_size; + /* switch to next page, if we reached the end of the page */ + if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) { + /* next page */ + p_info->act_entry = 0; + p_info->act_page += 1; + if ((p_info->act_page % id->pages_per_area) == 0) { + /* next area */ + p_info->act_area++; + p_info->act_page = 0; + } + if (p_info->act_area >= id->nr_areas) + return 1; + } +out: + return 0; +} + +/* + * debug_output: + * - called for user read() + * - copies formated debug entries to the user buffer + */ +static ssize_t debug_output(struct file *file, /* file descriptor */ + char __user *user_buf, /* user buffer */ + size_t len, /* length of buffer */ + loff_t *offset) /* offset in the file */ +{ + size_t count = 0; + size_t entry_offset; + file_private_info_t *p_info; + + p_info = (file_private_info_t *) file->private_data; + if (*offset != p_info->offset) + return -EPIPE; + if (p_info->act_area >= p_info->debug_info_snap->nr_areas) + return 0; + entry_offset = p_info->act_entry_offset; + while (count < len) { + int formatted_line_residue; + int formatted_line_size; + int user_buf_residue; + size_t copy_size; + + formatted_line_size = debug_format_entry(p_info); + formatted_line_residue = formatted_line_size - entry_offset; + user_buf_residue = len-count; + copy_size = min(user_buf_residue, formatted_line_residue); + if (copy_size) { + if (copy_to_user(user_buf + count, p_info->temp_buf + + entry_offset, copy_size)) + return -EFAULT; + count += copy_size; + entry_offset += copy_size; + } + if (copy_size == formatted_line_residue) { + entry_offset = 0; + if (debug_next_entry(p_info)) + goto out; + } + } +out: + p_info->offset = *offset + count; + p_info->act_entry_offset = entry_offset; + *offset = p_info->offset; + return count; +} + +/* + * debug_input: + * - called for user write() + * - calls input function of view + */ +static ssize_t debug_input(struct file *file, const char __user *user_buf, + size_t length, loff_t *offset) +{ + file_private_info_t *p_info; + int rc = 0; + + mutex_lock(&debug_mutex); + p_info = ((file_private_info_t *) file->private_data); + if (p_info->view->input_proc) { + rc = p_info->view->input_proc(p_info->debug_info_org, + p_info->view, file, user_buf, + length, offset); + } else { + rc = -EPERM; + } + mutex_unlock(&debug_mutex); + return rc; /* number of input characters */ +} + +/* + * debug_open: + * - called for user open() + * - copies formated output to private_data area of the file + * handle + */ +static int debug_open(struct inode *inode, struct file *file) +{ + debug_info_t *debug_info, *debug_info_snapshot; + file_private_info_t *p_info; + int i, rc = 0; + + mutex_lock(&debug_mutex); + debug_info = file_inode(file)->i_private; + /* find debug view */ + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!debug_info->views[i]) + continue; + else if (debug_info->debugfs_entries[i] == file->f_path.dentry) + goto found; /* found view ! */ + } + /* no entry found */ + rc = -EINVAL; + goto out; + +found: + + /* Make snapshot of current debug areas to get it consistent. */ + /* To copy all the areas is only needed, if we have a view which */ + /* formats the debug areas. */ + + if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc) + debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); + else + debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); + + if (!debug_info_snapshot) { + rc = -ENOMEM; + goto out; + } + p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + if (!p_info) { + debug_info_free(debug_info_snapshot); + rc = -ENOMEM; + goto out; + } + p_info->offset = 0; + p_info->debug_info_snap = debug_info_snapshot; + p_info->debug_info_org = debug_info; + p_info->view = debug_info->views[i]; + p_info->act_area = 0; + p_info->act_page = 0; + p_info->act_entry = DEBUG_PROLOG_ENTRY; + p_info->act_entry_offset = 0; + file->private_data = p_info; + debug_info_get(debug_info); + nonseekable_open(inode, file); +out: + mutex_unlock(&debug_mutex); + return rc; +} + +/* + * debug_close: + * - called for user close() + * - deletes private_data area of the file handle + */ +static int debug_close(struct inode *inode, struct file *file) +{ + file_private_info_t *p_info; + + p_info = (file_private_info_t *) file->private_data; + if (p_info->debug_info_snap) + debug_info_free(p_info->debug_info_snap); + debug_info_put(p_info->debug_info_org); + kfree(file->private_data); + return 0; /* success */ +} + +/* + * debug_register_mode: + * - Creates and initializes debug area for the caller + * The mode parameter allows to specify access rights for the s390dbf files + * - Returns handle for debug area + */ +debug_info_t *debug_register_mode(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode, + uid_t uid, gid_t gid) +{ + debug_info_t *rc = NULL; + + /* Since debugfs currently does not support uid/gid other than root, */ + /* we do not allow gid/uid != 0 until we get support for that. */ + if ((uid != 0) || (gid != 0)) + pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); + BUG_ON(!initialized); + mutex_lock(&debug_mutex); + + /* create new debug_info */ + rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); + if (!rc) + goto out; + debug_register_view(rc, &debug_level_view); + debug_register_view(rc, &debug_flush_view); + debug_register_view(rc, &debug_pages_view); +out: + if (!rc) + pr_err("Registering debug feature %s failed\n", name); + mutex_unlock(&debug_mutex); + return rc; +} +EXPORT_SYMBOL(debug_register_mode); + +/* + * debug_register: + * - creates and initializes debug area for the caller + * - returns handle for debug area + */ +debug_info_t *debug_register(const char *name, int pages_per_area, + int nr_areas, int buf_size) +{ + return debug_register_mode(name, pages_per_area, nr_areas, buf_size, + S_IRUSR | S_IWUSR, 0, 0); +} +EXPORT_SYMBOL(debug_register); + +/* + * debug_unregister: + * - give back debug area + */ +void debug_unregister(debug_info_t *id) +{ + if (!id) + return; + mutex_lock(&debug_mutex); + debug_info_put(id); + mutex_unlock(&debug_mutex); +} +EXPORT_SYMBOL(debug_unregister); + +/* + * debug_set_size: + * - set area size (number of pages) and number of areas + */ +static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) +{ + debug_entry_t ***new_areas; + unsigned long flags; + int rc = 0; + + if (!id || (nr_areas <= 0) || (pages_per_area < 0)) + return -EINVAL; + if (pages_per_area > 0) { + new_areas = debug_areas_alloc(pages_per_area, nr_areas); + if (!new_areas) { + pr_info("Allocating memory for %i pages failed\n", + pages_per_area); + rc = -ENOMEM; + goto out; + } + } else { + new_areas = NULL; + } + spin_lock_irqsave(&id->lock, flags); + debug_areas_free(id); + id->areas = new_areas; + id->nr_areas = nr_areas; + id->pages_per_area = pages_per_area; + id->active_area = 0; + memset(id->active_entries, 0, sizeof(int)*id->nr_areas); + memset(id->active_pages, 0, sizeof(int)*id->nr_areas); + spin_unlock_irqrestore(&id->lock, flags); + pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); +out: + return rc; +} + +/* + * debug_set_level: + * - set actual debug level + */ +void debug_set_level(debug_info_t *id, int new_level) +{ + unsigned long flags; + + if (!id) + return; + spin_lock_irqsave(&id->lock, flags); + if (new_level == DEBUG_OFF_LEVEL) { + id->level = DEBUG_OFF_LEVEL; + pr_info("%s: switched off\n", id->name); + } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { + pr_info("%s: level %i is out of range (%i - %i)\n", + id->name, new_level, 0, DEBUG_MAX_LEVEL); + } else { + id->level = new_level; + } + spin_unlock_irqrestore(&id->lock, flags); +} +EXPORT_SYMBOL(debug_set_level); + +/* + * proceed_active_entry: + * - set active entry to next in the ring buffer + */ +static inline void proceed_active_entry(debug_info_t *id) +{ + if ((id->active_entries[id->active_area] += id->entry_size) + > (PAGE_SIZE - id->entry_size)) { + id->active_entries[id->active_area] = 0; + id->active_pages[id->active_area] = + (id->active_pages[id->active_area] + 1) % + id->pages_per_area; + } +} + +/* + * proceed_active_area: + * - set active area to next in the ring buffer + */ +static inline void proceed_active_area(debug_info_t *id) +{ + id->active_area++; + id->active_area = id->active_area % id->nr_areas; +} + +/* + * get_active_entry: + */ +static inline debug_entry_t *get_active_entry(debug_info_t *id) +{ + return (debug_entry_t *) (((char *) id->areas[id->active_area] + [id->active_pages[id->active_area]]) + + id->active_entries[id->active_area]); +} + +/* + * debug_finish_entry: + * - set timestamp, caller address, cpu number etc. + */ + +static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active, + int level, int exception) +{ + active->id.stck = get_tod_clock_fast() - + *(unsigned long long *) &tod_clock_base[1]; + active->id.fields.cpuid = smp_processor_id(); + active->caller = __builtin_return_address(0); + active->id.fields.exception = exception; + active->id.fields.level = level; + proceed_active_entry(id); + if (exception) + proceed_active_area(id); +} + +static int debug_stoppable = 1; +static int debug_active = 1; + +#define CTL_S390DBF_STOPPABLE 5678 +#define CTL_S390DBF_ACTIVE 5679 + +/* + * proc handler for the running debug_active sysctl + * always allow read, allow write only if debug_stoppable is set or + * if debug_active is already off + */ +static int s390dbf_procactive(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (!write || debug_stoppable || !debug_active) + return proc_dointvec(table, write, buffer, lenp, ppos); + else + return 0; +} + +static struct ctl_table s390dbf_table[] = { + { + .procname = "debug_stoppable", + .data = &debug_stoppable, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, + }, + { + .procname = "debug_active", + .data = &debug_active, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = s390dbf_procactive, + }, + { } +}; + +static struct ctl_table s390dbf_dir_table[] = { + { + .procname = "s390dbf", + .maxlen = 0, + .mode = S_IRUGO | S_IXUGO, + .child = s390dbf_table, + }, + { } +}; + +static struct ctl_table_header *s390dbf_sysctl_header; + +void debug_stop_all(void) +{ + if (debug_stoppable) + debug_active = 0; +} +EXPORT_SYMBOL(debug_stop_all); + +void debug_set_critical(void) +{ + debug_critical = 1; +} + +/* + * debug_event_common: + * - write debug entry with given size + */ +debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, + int len) +{ + debug_entry_t *active; + unsigned long flags; + + if (!debug_active || !id->areas) + return NULL; + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + do { + active = get_active_entry(id); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); + debug_finish_entry(id, active, level, 0); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + + spin_unlock_irqrestore(&id->lock, flags); + return active; +} +EXPORT_SYMBOL(debug_event_common); + +/* + * debug_exception_common: + * - write debug entry with given size and switch to next debug area + */ +debug_entry_t *debug_exception_common(debug_info_t *id, int level, + const void *buf, int len) +{ + debug_entry_t *active; + unsigned long flags; + + if (!debug_active || !id->areas) + return NULL; + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + do { + active = get_active_entry(id); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); + debug_finish_entry(id, active, level, len <= id->buf_size); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + + spin_unlock_irqrestore(&id->lock, flags); + return active; +} +EXPORT_SYMBOL(debug_exception_common); + +/* + * counts arguments in format string for sprintf view + */ +static inline int debug_count_numargs(char *string) +{ + int numargs = 0; + + while (*string) { + if (*string++ == '%') + numargs++; + } + return numargs; +} + +/* + * debug_sprintf_event: + */ +debug_entry_t *__debug_sprintf_event(debug_info_t *id, int level, char *string, ...) +{ + debug_sprintf_entry_t *curr_event; + debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; + + if (!debug_active || !id->areas) + return NULL; + numargs = debug_count_numargs(string); + + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + active = get_active_entry(id); + curr_event = (debug_sprintf_entry_t *) DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); + va_end(ap); + debug_finish_entry(id, active, level, 0); + spin_unlock_irqrestore(&id->lock, flags); + + return active; +} +EXPORT_SYMBOL(__debug_sprintf_event); + +/* + * debug_sprintf_exception: + */ +debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) +{ + debug_sprintf_entry_t *curr_event; + debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; + + if (!debug_active || !id->areas) + return NULL; + + numargs = debug_count_numargs(string); + + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + active = get_active_entry(id); + curr_event = (debug_sprintf_entry_t *)DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); + va_end(ap); + debug_finish_entry(id, active, level, 1); + spin_unlock_irqrestore(&id->lock, flags); + + return active; +} +EXPORT_SYMBOL(__debug_sprintf_exception); + +/* + * debug_register_view: + */ +int debug_register_view(debug_info_t *id, struct debug_view *view) +{ + unsigned long flags; + struct dentry *pde; + umode_t mode; + int rc = 0; + int i; + + if (!id) + goto out; + mode = (id->mode | S_IFREG) & ~S_IXUGO; + if (!(view->prolog_proc || view->format_proc || view->header_proc)) + mode &= ~(S_IRUSR | S_IRGRP | S_IROTH); + if (!view->input_proc) + mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); + pde = debugfs_create_file(view->name, mode, id->debugfs_root_entry, + id, &debug_file_ops); + if (!pde) { + pr_err("Registering view %s/%s failed due to out of " + "memory\n", id->name, view->name); + rc = -1; + goto out; + } + spin_lock_irqsave(&id->lock, flags); + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + break; + } + if (i == DEBUG_MAX_VIEWS) { + pr_err("Registering view %s/%s would exceed the maximum " + "number of views %i\n", id->name, view->name, i); + rc = -1; + } else { + id->views[i] = view; + id->debugfs_entries[i] = pde; + } + spin_unlock_irqrestore(&id->lock, flags); + if (rc) + debugfs_remove(pde); +out: + return rc; +} +EXPORT_SYMBOL(debug_register_view); + +/* + * debug_unregister_view: + */ +int debug_unregister_view(debug_info_t *id, struct debug_view *view) +{ + struct dentry *dentry = NULL; + unsigned long flags; + int i, rc = 0; + + if (!id) + goto out; + spin_lock_irqsave(&id->lock, flags); + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (id->views[i] == view) + break; + } + if (i == DEBUG_MAX_VIEWS) { + rc = -1; + } else { + dentry = id->debugfs_entries[i]; + id->views[i] = NULL; + id->debugfs_entries[i] = NULL; + } + spin_unlock_irqrestore(&id->lock, flags); + debugfs_remove(dentry); +out: + return rc; +} +EXPORT_SYMBOL(debug_unregister_view); + +static inline char *debug_get_user_string(const char __user *user_buf, + size_t user_len) +{ + char *buffer; + + buffer = kmalloc(user_len + 1, GFP_KERNEL); + if (!buffer) + return ERR_PTR(-ENOMEM); + if (copy_from_user(buffer, user_buf, user_len) != 0) { + kfree(buffer); + return ERR_PTR(-EFAULT); + } + /* got the string, now strip linefeed. */ + if (buffer[user_len - 1] == '\n') + buffer[user_len - 1] = 0; + else + buffer[user_len] = 0; + return buffer; +} + +static inline int debug_get_uint(char *buf) +{ + int rc; + + buf = skip_spaces(buf); + rc = simple_strtoul(buf, &buf, 10); + if (*buf) + rc = -EINVAL; + return rc; +} + +/* + * functions for debug-views + *********************************** +*/ + +/* + * prints out actual debug level + */ + +static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) +{ + return sprintf(out_buf, "%i\n", id->pages_per_area); +} + +/* + * reads new size (number of pages per debug area) + */ + +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + int rc, new_pages; + char *str; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { + rc = PTR_ERR(str); + goto out; + } + new_pages = debug_get_uint(str); + if (new_pages < 0) { + rc = -EINVAL; + goto free_str; + } + rc = debug_set_size(id, id->nr_areas, new_pages); + if (rc != 0) { + rc = -EINVAL; + goto free_str; + } + rc = user_len; +free_str: + kfree(str); +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * prints out actual debug level + */ +static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) +{ + int rc = 0; + + if (id->level == DEBUG_OFF_LEVEL) + rc = sprintf(out_buf, "-\n"); + else + rc = sprintf(out_buf, "%i\n", id->level); + return rc; +} + +/* + * reads new debug level + */ +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + int rc, new_level; + char *str; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { + rc = PTR_ERR(str); + goto out; + } + if (str[0] == '-') { + debug_set_level(id, DEBUG_OFF_LEVEL); + rc = user_len; + goto free_str; + } else { + new_level = debug_get_uint(str); + } + if (new_level < 0) { + pr_warn("%s is not a valid level for a debug feature\n", str); + rc = -EINVAL; + } else { + debug_set_level(id, new_level); + rc = user_len; + } +free_str: + kfree(str); +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * flushes debug areas + */ +static void debug_flush(debug_info_t *id, int area) +{ + unsigned long flags; + int i, j; + + if (!id || !id->areas) + return; + spin_lock_irqsave(&id->lock, flags); + if (area == DEBUG_FLUSH_ALL) { + id->active_area = 0; + memset(id->active_entries, 0, id->nr_areas * sizeof(int)); + for (i = 0; i < id->nr_areas; i++) { + id->active_pages[i] = 0; + for (j = 0; j < id->pages_per_area; j++) + memset(id->areas[i][j], 0, PAGE_SIZE); + } + } else if (area >= 0 && area < id->nr_areas) { + id->active_entries[area] = 0; + id->active_pages[area] = 0; + for (i = 0; i < id->pages_per_area; i++) + memset(id->areas[area][i], 0, PAGE_SIZE); + } + spin_unlock_irqrestore(&id->lock, flags); +} + +/* + * view function: flushes debug areas + */ +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + char input_buf[1]; + int rc = user_len; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + if (copy_from_user(input_buf, user_buf, 1)) { + rc = -EFAULT; + goto out; + } + if (input_buf[0] == '-') { + debug_flush(id, DEBUG_FLUSH_ALL); + goto out; + } + if (isdigit(input_buf[0])) { + int area = ((int) input_buf[0] - (int) '0'); + + debug_flush(id, area); + goto out; + } + + pr_info("Flushing debug data failed because %c is not a valid " + "area\n", input_buf[0]); + +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * prints debug header in raw format + */ +static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf) +{ + int rc; + + rc = sizeof(debug_entry_t); + memcpy(out_buf, entry, sizeof(debug_entry_t)); + return rc; +} + +/* + * prints debug data in raw format + */ +static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf) +{ + int rc; + + rc = id->buf_size; + memcpy(out_buf, in_buf, id->buf_size); + return rc; +} + +/* + * prints debug data in hex/ascii format + */ +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf) +{ + int i, rc = 0; + + for (i = 0; i < id->buf_size; i++) + rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); + rc += sprintf(out_buf + rc, "| "); + for (i = 0; i < id->buf_size; i++) { + unsigned char c = in_buf[i]; + + if (isascii(c) && isprint(c)) + rc += sprintf(out_buf + rc, "%c", c); + else + rc += sprintf(out_buf + rc, "."); + } + rc += sprintf(out_buf + rc, "\n"); + return rc; +} + +/* + * prints header for debug entry + */ +int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf) +{ + unsigned long base, sec, usec; + unsigned long caller; + unsigned int level; + char *except_str; + int rc = 0; + + level = entry->id.fields.level; + base = (*(unsigned long *) &tod_clock_base[0]) >> 4; + sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); + usec = do_div(sec, USEC_PER_SEC); + + if (entry->id.fields.exception) + except_str = "*"; + else + except_str = "-"; + caller = (unsigned long) entry->caller; + rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ", + area, sec, usec, level, except_str, + entry->id.fields.cpuid, (void *)caller); + return rc; +} +EXPORT_SYMBOL(debug_dflt_header_fn); + +/* + * prints debug data sprintf-formated: + * debug_sprinf_event/exception calls must be used together with this view + */ + +#define DEBUG_SPRINTF_MAX_ARGS 10 + +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, debug_sprintf_entry_t *curr_event) +{ + int num_longs, num_used_args = 0, i, rc = 0; + int index[DEBUG_SPRINTF_MAX_ARGS]; + + /* count of longs fit into one entry */ + num_longs = id->buf_size / sizeof(long); + + if (num_longs < 1) + goto out; /* bufsize of entry too small */ + if (num_longs == 1) { + /* no args, we use only the string */ + strcpy(out_buf, curr_event->string); + rc = strlen(curr_event->string); + goto out; + } + + /* number of arguments used for sprintf (without the format string) */ + num_used_args = min(DEBUG_SPRINTF_MAX_ARGS, (num_longs - 1)); + + memset(index, 0, DEBUG_SPRINTF_MAX_ARGS * sizeof(int)); + + for (i = 0; i < num_used_args; i++) + index[i] = i; + + rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); +out: + return rc; +} + +/* + * debug_init: + * - is called exactly once to initialize the debug feature + */ +static int __init debug_init(void) +{ + s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + mutex_lock(&debug_mutex); + debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); + initialized = 1; + mutex_unlock(&debug_mutex); + return 0; +} +postcore_initcall(debug_init); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c new file mode 100644 index 000000000..4c7cf8787 --- /dev/null +++ b/arch/s390/kernel/diag.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of s390 diagnose codes + * + * Copyright IBM Corp. 2007 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/export.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <asm/diag.h> +#include <asm/trace/diag.h> + +struct diag_stat { + unsigned int counter[NR_DIAG_STAT]; +}; + +static DEFINE_PER_CPU(struct diag_stat, diag_stat); + +struct diag_desc { + int code; + char *name; +}; + +static const struct diag_desc diag_map[NR_DIAG_STAT] = { + [DIAG_STAT_X008] = { .code = 0x008, .name = "Console Function" }, + [DIAG_STAT_X00C] = { .code = 0x00c, .name = "Pseudo Timer" }, + [DIAG_STAT_X010] = { .code = 0x010, .name = "Release Pages" }, + [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, + [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, + [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, + [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, + [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, + [DIAG_STAT_X210] = { .code = 0x210, .name = "Device Information" }, + [DIAG_STAT_X224] = { .code = 0x224, .name = "EBCDIC-Name Table" }, + [DIAG_STAT_X250] = { .code = 0x250, .name = "Block I/O" }, + [DIAG_STAT_X258] = { .code = 0x258, .name = "Page-Reference Services" }, + [DIAG_STAT_X26C] = { .code = 0x26c, .name = "Certain System Information" }, + [DIAG_STAT_X288] = { .code = 0x288, .name = "Time Bomb" }, + [DIAG_STAT_X2C4] = { .code = 0x2c4, .name = "FTP Services" }, + [DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" }, + [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, + [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, + [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, +}; + +static int show_diag_stat(struct seq_file *m, void *v) +{ + struct diag_stat *stat; + unsigned long n = (unsigned long) v - 1; + int cpu, prec, tmp; + + get_online_cpus(); + if (n == 0) { + seq_puts(m, " "); + + for_each_online_cpu(cpu) { + prec = 10; + for (tmp = 10; cpu >= tmp; tmp *= 10) + prec--; + seq_printf(m, "%*s%d", prec, "CPU", cpu); + } + seq_putc(m, '\n'); + } else if (n <= NR_DIAG_STAT) { + seq_printf(m, "diag %03x:", diag_map[n-1].code); + for_each_online_cpu(cpu) { + stat = &per_cpu(diag_stat, cpu); + seq_printf(m, " %10u", stat->counter[n-1]); + } + seq_printf(m, " %s\n", diag_map[n-1].name); + } + put_online_cpus(); + return 0; +} + +static void *show_diag_stat_start(struct seq_file *m, loff_t *pos) +{ + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; +} + +static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return show_diag_stat_start(m, pos); +} + +static void show_diag_stat_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations show_diag_stat_sops = { + .start = show_diag_stat_start, + .next = show_diag_stat_next, + .stop = show_diag_stat_stop, + .show = show_diag_stat, +}; + +static int show_diag_stat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &show_diag_stat_sops); +} + +static const struct file_operations show_diag_stat_fops = { + .open = show_diag_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +static int __init show_diag_stat_init(void) +{ + debugfs_create_file("diag_stat", 0400, NULL, NULL, + &show_diag_stat_fops); + return 0; +} + +device_initcall(show_diag_stat_init); + +void diag_stat_inc(enum diag_stat_enum nr) +{ + this_cpu_inc(diag_stat.counter[nr]); + trace_s390_diagnose(diag_map[nr].code); +} +EXPORT_SYMBOL(diag_stat_inc); + +void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) +{ + this_cpu_inc(diag_stat.counter[nr]); + trace_s390_diagnose_norecursion(diag_map[nr].code); +} +EXPORT_SYMBOL(diag_stat_inc_norecursion); + +/* + * Diagnose 14: Input spool file manipulation + */ +static inline int __diag14(unsigned long rx, unsigned long ry1, + unsigned long subcode) +{ + register unsigned long _ry1 asm("2") = ry1; + register unsigned long _ry2 asm("3") = subcode; + int rc = 0; + + asm volatile( + " sam31\n" + " diag %2,2,0x14\n" + " sam64\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (rc), "+d" (_ry2) + : "d" (rx), "d" (_ry1) + : "cc"); + + return rc; +} + +int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) +{ + diag_stat_inc(DIAG_STAT_X014); + return __diag14(rx, ry1, subcode); +} +EXPORT_SYMBOL(diag14); + +static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) +{ + register unsigned long _subcode asm("0") = *subcode; + register unsigned long _size asm("1") = size; + + asm volatile( + " diag %2,%0,0x204\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); + *subcode = _subcode; + return _size; +} + +int diag204(unsigned long subcode, unsigned long size, void *addr) +{ + diag_stat_inc(DIAG_STAT_X204); + size = __diag204(&subcode, size, addr); + if (subcode) + return -1; + return size; +} +EXPORT_SYMBOL(diag204); + +/* + * Diagnose 210: Get information about a virtual device + */ +int diag210(struct diag210 *addr) +{ + /* + * diag 210 needs its data below the 2GB border, so we + * use a static data area to be sure + */ + static struct diag210 diag210_tmp; + static DEFINE_SPINLOCK(diag210_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag210_lock, flags); + diag210_tmp = *addr; + + diag_stat_inc(DIAG_STAT_X210); + asm volatile( + " lhi %0,-1\n" + " sam31\n" + " diag %1,0,0x210\n" + "0: ipm %0\n" + " srl %0,28\n" + "1: sam64\n" + EX_TABLE(0b, 1b) + : "=&d" (ccode) : "a" (&diag210_tmp) : "cc", "memory"); + + *addr = diag210_tmp; + spin_unlock_irqrestore(&diag210_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag210); + +int diag224(void *ptr) +{ + int rc = -EOPNOTSUPP; + + diag_stat_inc(DIAG_STAT_X224); + asm volatile( + " diag %1,%2,0x224\n" + "0: lhi %0,0x0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + return rc; +} +EXPORT_SYMBOL(diag224); + +/* + * Diagnose 26C: Access Certain System Information + */ +static inline int __diag26c(void *req, void *resp, enum diag26c_sc subcode) +{ + register unsigned long _req asm("2") = (addr_t) req; + register unsigned long _resp asm("3") = (addr_t) resp; + register unsigned long _subcode asm("4") = subcode; + register unsigned long _rc asm("5") = -EOPNOTSUPP; + + asm volatile( + " sam31\n" + " diag %[rx],%[ry],0x26c\n" + "0: sam64\n" + EX_TABLE(0b,0b) + : "+d" (_rc) + : [rx] "d" (_req), "d" (_resp), [ry] "d" (_subcode) + : "cc", "memory"); + return _rc; +} + +int diag26c(void *req, void *resp, enum diag26c_sc subcode) +{ + diag_stat_inc(DIAG_STAT_X26C); + return __diag26c(req, resp, subcode); +} +EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c new file mode 100644 index 000000000..01307db4b --- /dev/null +++ b/arch/s390/kernel/dis.c @@ -0,0 +1,582 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Disassemble s390 instructions. + * + * Copyright IBM Corp. 2007 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/kallsyms.h> +#include <linux/reboot.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/uaccess.h> +#include <linux/atomic.h> +#include <asm/dis.h> +#include <asm/io.h> +#include <asm/cpcmd.h> +#include <asm/lowcore.h> +#include <asm/debug.h> +#include <asm/irq.h> + +/* Type of operand */ +#define OPERAND_GPR 0x1 /* Operand printed as %rx */ +#define OPERAND_FPR 0x2 /* Operand printed as %fx */ +#define OPERAND_AR 0x4 /* Operand printed as %ax */ +#define OPERAND_CR 0x8 /* Operand printed as %cx */ +#define OPERAND_VR 0x10 /* Operand printed as %vx */ +#define OPERAND_DISP 0x20 /* Operand printed as displacement */ +#define OPERAND_BASE 0x40 /* Operand printed as base register */ +#define OPERAND_INDEX 0x80 /* Operand printed as index register */ +#define OPERAND_PCREL 0x100 /* Operand printed as pc-relative symbol */ +#define OPERAND_SIGNED 0x200 /* Operand printed as signed value */ +#define OPERAND_LENGTH 0x400 /* Operand printed as length (+1) */ + +struct s390_operand { + unsigned char bits; /* The number of bits in the operand. */ + unsigned char shift; /* The number of bits to shift. */ + unsigned short flags; /* One bit syntax flags. */ +}; + +struct s390_insn { + union { + const char name[5]; + struct { + unsigned char zero; + unsigned int offset; + } __packed; + }; + unsigned char opfrag; + unsigned char format; +}; + +struct s390_opcode_offset { + unsigned char opcode; + unsigned char mask; + unsigned char byte; + unsigned short offset; + unsigned short count; +} __packed; + +enum { + UNUSED, + A_8, /* Access reg. starting at position 8 */ + A_12, /* Access reg. starting at position 12 */ + A_24, /* Access reg. starting at position 24 */ + A_28, /* Access reg. starting at position 28 */ + B_16, /* Base register starting at position 16 */ + B_32, /* Base register starting at position 32 */ + C_8, /* Control reg. starting at position 8 */ + C_12, /* Control reg. starting at position 12 */ + D20_20, /* 20 bit displacement starting at 20 */ + D_20, /* Displacement starting at position 20 */ + D_36, /* Displacement starting at position 36 */ + F_8, /* FPR starting at position 8 */ + F_12, /* FPR starting at position 12 */ + F_16, /* FPR starting at position 16 */ + F_24, /* FPR starting at position 24 */ + F_28, /* FPR starting at position 28 */ + F_32, /* FPR starting at position 32 */ + I8_8, /* 8 bit signed value starting at 8 */ + I8_32, /* 8 bit signed value starting at 32 */ + I16_16, /* 16 bit signed value starting at 16 */ + I16_32, /* 16 bit signed value starting at 32 */ + I32_16, /* 32 bit signed value starting at 16 */ + J12_12, /* 12 bit PC relative offset at 12 */ + J16_16, /* 16 bit PC relative offset at 16 */ + J16_32, /* 16 bit PC relative offset at 32 */ + J24_24, /* 24 bit PC relative offset at 24 */ + J32_16, /* 32 bit PC relative offset at 16 */ + L4_8, /* 4 bit length starting at position 8 */ + L4_12, /* 4 bit length starting at position 12 */ + L8_8, /* 8 bit length starting at position 8 */ + R_8, /* GPR starting at position 8 */ + R_12, /* GPR starting at position 12 */ + R_16, /* GPR starting at position 16 */ + R_24, /* GPR starting at position 24 */ + R_28, /* GPR starting at position 28 */ + U4_8, /* 4 bit unsigned value starting at 8 */ + U4_12, /* 4 bit unsigned value starting at 12 */ + U4_16, /* 4 bit unsigned value starting at 16 */ + U4_20, /* 4 bit unsigned value starting at 20 */ + U4_24, /* 4 bit unsigned value starting at 24 */ + U4_28, /* 4 bit unsigned value starting at 28 */ + U4_32, /* 4 bit unsigned value starting at 32 */ + U4_36, /* 4 bit unsigned value starting at 36 */ + U8_8, /* 8 bit unsigned value starting at 8 */ + U8_16, /* 8 bit unsigned value starting at 16 */ + U8_24, /* 8 bit unsigned value starting at 24 */ + U8_28, /* 8 bit unsigned value starting at 28 */ + U8_32, /* 8 bit unsigned value starting at 32 */ + U12_16, /* 12 bit unsigned value starting at 16 */ + U16_16, /* 16 bit unsigned value starting at 16 */ + U16_32, /* 16 bit unsigned value starting at 32 */ + U32_16, /* 32 bit unsigned value starting at 16 */ + VX_12, /* Vector index register starting at position 12 */ + V_8, /* Vector reg. starting at position 8 */ + V_12, /* Vector reg. starting at position 12 */ + V_16, /* Vector reg. starting at position 16 */ + V_32, /* Vector reg. starting at position 32 */ + X_12, /* Index register starting at position 12 */ +}; + +static const struct s390_operand operands[] = { + [UNUSED] = { 0, 0, 0 }, + [A_8] = { 4, 8, OPERAND_AR }, + [A_12] = { 4, 12, OPERAND_AR }, + [A_24] = { 4, 24, OPERAND_AR }, + [A_28] = { 4, 28, OPERAND_AR }, + [B_16] = { 4, 16, OPERAND_BASE | OPERAND_GPR }, + [B_32] = { 4, 32, OPERAND_BASE | OPERAND_GPR }, + [C_8] = { 4, 8, OPERAND_CR }, + [C_12] = { 4, 12, OPERAND_CR }, + [D20_20] = { 20, 20, OPERAND_DISP | OPERAND_SIGNED }, + [D_20] = { 12, 20, OPERAND_DISP }, + [D_36] = { 12, 36, OPERAND_DISP }, + [F_8] = { 4, 8, OPERAND_FPR }, + [F_12] = { 4, 12, OPERAND_FPR }, + [F_16] = { 4, 16, OPERAND_FPR }, + [F_24] = { 4, 24, OPERAND_FPR }, + [F_28] = { 4, 28, OPERAND_FPR }, + [F_32] = { 4, 32, OPERAND_FPR }, + [I8_8] = { 8, 8, OPERAND_SIGNED }, + [I8_32] = { 8, 32, OPERAND_SIGNED }, + [I16_16] = { 16, 16, OPERAND_SIGNED }, + [I16_32] = { 16, 32, OPERAND_SIGNED }, + [I32_16] = { 32, 16, OPERAND_SIGNED }, + [J12_12] = { 12, 12, OPERAND_PCREL }, + [J16_16] = { 16, 16, OPERAND_PCREL }, + [J16_32] = { 16, 32, OPERAND_PCREL }, + [J24_24] = { 24, 24, OPERAND_PCREL }, + [J32_16] = { 32, 16, OPERAND_PCREL }, + [L4_8] = { 4, 8, OPERAND_LENGTH }, + [L4_12] = { 4, 12, OPERAND_LENGTH }, + [L8_8] = { 8, 8, OPERAND_LENGTH }, + [R_8] = { 4, 8, OPERAND_GPR }, + [R_12] = { 4, 12, OPERAND_GPR }, + [R_16] = { 4, 16, OPERAND_GPR }, + [R_24] = { 4, 24, OPERAND_GPR }, + [R_28] = { 4, 28, OPERAND_GPR }, + [U4_8] = { 4, 8, 0 }, + [U4_12] = { 4, 12, 0 }, + [U4_16] = { 4, 16, 0 }, + [U4_20] = { 4, 20, 0 }, + [U4_24] = { 4, 24, 0 }, + [U4_28] = { 4, 28, 0 }, + [U4_32] = { 4, 32, 0 }, + [U4_36] = { 4, 36, 0 }, + [U8_8] = { 8, 8, 0 }, + [U8_16] = { 8, 16, 0 }, + [U8_24] = { 8, 24, 0 }, + [U8_28] = { 8, 28, 0 }, + [U8_32] = { 8, 32, 0 }, + [U12_16] = { 12, 16, 0 }, + [U16_16] = { 16, 16, 0 }, + [U16_32] = { 16, 32, 0 }, + [U32_16] = { 32, 16, 0 }, + [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, + [V_8] = { 4, 8, OPERAND_VR }, + [V_12] = { 4, 12, OPERAND_VR }, + [V_16] = { 4, 16, OPERAND_VR }, + [V_32] = { 4, 32, OPERAND_VR }, + [X_12] = { 4, 12, OPERAND_INDEX | OPERAND_GPR }, +}; + +static const unsigned char formats[][6] = { + [INSTR_E] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_IE_UU] = { U4_24, U4_28, 0, 0, 0, 0 }, + [INSTR_MII_UPP] = { U4_8, J12_12, J24_24 }, + [INSTR_RIE_R0IU] = { R_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_R0UU] = { R_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_RRI0] = { R_8, R_12, I16_16, 0, 0, 0 }, + [INSTR_RIE_RRP] = { R_8, R_12, J16_16, 0, 0, 0 }, + [INSTR_RIE_RRPU] = { R_8, R_12, U4_32, J16_16, 0, 0 }, + [INSTR_RIE_RRUUU] = { R_8, R_12, U8_16, U8_24, U8_32, 0 }, + [INSTR_RIE_RUI0] = { R_8, I16_16, U4_12, 0, 0, 0 }, + [INSTR_RIE_RUPI] = { R_8, I8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIE_RUPU] = { R_8, U8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIL_RI] = { R_8, I32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RP] = { R_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RU] = { R_8, U32_16, 0, 0, 0, 0 }, + [INSTR_RIL_UP] = { U4_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIS_RURDI] = { R_8, I8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RIS_RURDU] = { R_8, U8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RI_RI] = { R_8, I16_16, 0, 0, 0, 0 }, + [INSTR_RI_RP] = { R_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RI_RU] = { R_8, U16_16, 0, 0, 0, 0 }, + [INSTR_RI_UP] = { U4_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RRE_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_RRE_AA] = { A_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_AR] = { A_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_F0] = { F_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_FF] = { F_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_FR] = { F_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_R0] = { R_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_RA] = { R_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_RF] = { R_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_RR] = { R_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRF_0UFF] = { F_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_0URF] = { R_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_F0FF] = { F_16, F_24, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FF2] = { F_24, F_16, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FR] = { F_24, F_16, R_28, 0, 0, 0 }, + [INSTR_RRF_FFRU] = { F_24, F_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF] = { F_24, F_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF2] = { F_24, F_28, F_16, U4_20, 0, 0 }, + [INSTR_RRF_R0RR] = { R_24, R_16, R_28, 0, 0, 0 }, + [INSTR_RRF_R0RR2] = { R_24, R_28, R_16, 0, 0, 0 }, + [INSTR_RRF_RURR] = { R_24, R_28, R_16, U4_20, 0, 0 }, + [INSTR_RRF_RURR2] = { R_24, R_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_U0FF] = { F_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RF] = { R_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RR] = { R_24, R_28, U4_16, 0, 0, 0 }, + [INSTR_RRF_UUFF] = { F_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_UUFR] = { F_24, U4_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_UURF] = { R_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRS_RRRDU] = { R_8, R_12, U4_32, D_20, B_16 }, + [INSTR_RR_FF] = { F_8, F_12, 0, 0, 0, 0 }, + [INSTR_RR_R0] = { R_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_RR] = { R_8, R_12, 0, 0, 0, 0 }, + [INSTR_RR_U0] = { U8_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_UR] = { U4_8, R_12, 0, 0, 0, 0 }, + [INSTR_RSI_RRP] = { R_8, R_12, J16_16, 0, 0, 0 }, + [INSTR_RSL_LRDFU] = { F_32, D_20, L8_8, B_16, U4_36, 0 }, + [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, + [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RS_AARD] = { A_8, A_12, D_20, B_16, 0, 0 }, + [INSTR_RS_CCRD] = { C_8, C_12, D_20, B_16, 0, 0 }, + [INSTR_RS_R0RD] = { R_8, D_20, B_16, 0, 0, 0 }, + [INSTR_RS_RRRD] = { R_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_RS_RURD] = { R_8, U4_12, D_20, B_16, 0, 0 }, + [INSTR_RXE_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RXE_RRRDU] = { R_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_RXF_FRRDF] = { F_32, F_8, D_20, X_12, B_16, 0 }, + [INSTR_RXY_FRRD] = { F_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_RRRD] = { R_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_URRD] = { U4_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RX_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_RRRD] = { R_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_URRD] = { U4_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, + [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, + [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SMI_U0RDP] = { U4_8, J16_32, D_20, B_16, 0, 0 }, + [INSTR_SSE_RDRD] = { D_20, B_16, D_36, B_32, 0, 0 }, + [INSTR_SSF_RRDRD] = { D_20, B_16, D_36, B_32, R_8, 0 }, + [INSTR_SSF_RRDRD2] = { R_8, D_20, B_16, D_36, B_32, 0 }, + [INSTR_SS_L0RDRD] = { D_20, L8_8, B_16, D_36, B_32, 0 }, + [INSTR_SS_L2RDRD] = { D_20, B_16, D_36, L8_8, B_32, 0 }, + [INSTR_SS_LIRDRD] = { D_20, L4_8, B_16, D_36, B_32, U4_12 }, + [INSTR_SS_LLRDRD] = { D_20, L4_8, B_16, D_36, L4_12, B_32 }, + [INSTR_SS_RRRDRD] = { D_20, R_8, B_16, D_36, B_32, R_12 }, + [INSTR_SS_RRRDRD2] = { R_8, D_20, B_16, R_12, D_36, B_32 }, + [INSTR_SS_RRRDRD3] = { R_8, R_12, D_20, B_16, D_36, B_32 }, + [INSTR_S_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_S_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0IU] = { V_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0U] = { V_8, U16_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, + [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, + [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, + [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, + [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, + [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, + [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, + [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_RV0U] = { R_8, V_12, U4_24, 0, 0, 0 }, + [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, + [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, + [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, + [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, + [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, + [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 }, + [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, + [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, + [INSTR_VRR_VVV0V] = { V_8, V_12, V_16, V_32, 0, 0 }, + [INSTR_VRR_VVVU0UV] = { V_8, V_12, V_16, V_32, U4_28, U4_20 }, + [INSTR_VRR_VVVU0V] = { V_8, V_12, V_16, V_32, U4_20, 0 }, + [INSTR_VRR_VVVUU0V] = { V_8, V_12, V_16, V_32, U4_20, U4_24 }, + [INSTR_VRS_RRDV] = { V_32, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_RVRDU] = { R_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VRRD] = { V_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_VRRDU] = { V_8, R_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VVRD] = { V_8, V_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_VVRDU] = { V_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRV_VVXRDU] = { V_8, D_20, VX_12, B_16, U4_32, 0 }, + [INSTR_VRX_VRRD] = { V_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_VRX_VRRDU] = { V_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_VRX_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VSI_URDV] = { V_32, D_20, B_16, U8_8, 0, 0 }, +}; + +static char long_insn_name[][7] = LONG_INSN_INITIALIZER; +static struct s390_insn opcode[] = OPCODE_TABLE_INITIALIZER; +static struct s390_opcode_offset opcode_offset[] = OPCODE_OFFSET_INITIALIZER; + +/* Extracts an operand value from an instruction. */ +static unsigned int extract_operand(unsigned char *code, + const struct s390_operand *operand) +{ + unsigned char *cp; + unsigned int val; + int bits; + + /* Extract fragments of the operand byte for byte. */ + cp = code + operand->shift / 8; + bits = (operand->shift & 7) + operand->bits; + val = 0; + do { + val <<= 8; + val |= (unsigned int) *cp++; + bits -= 8; + } while (bits > 0); + val >>= -bits; + val &= ((1U << (operand->bits - 1)) << 1) - 1; + + /* Check for special long displacement case. */ + if (operand->bits == 20 && operand->shift == 20) + val = (val & 0xff) << 12 | (val & 0xfff00) >> 8; + + /* Check for register extensions bits for vector registers. */ + if (operand->flags & OPERAND_VR) { + if (operand->shift == 8) + val |= (code[4] & 8) << 1; + else if (operand->shift == 12) + val |= (code[4] & 4) << 2; + else if (operand->shift == 16) + val |= (code[4] & 2) << 3; + else if (operand->shift == 32) + val |= (code[4] & 1) << 4; + } + + /* Sign extend value if the operand is signed or pc relative. */ + if ((operand->flags & (OPERAND_SIGNED | OPERAND_PCREL)) && + (val & (1U << (operand->bits - 1)))) + val |= (-1U << (operand->bits - 1)) << 1; + + /* Double value if the operand is pc relative. */ + if (operand->flags & OPERAND_PCREL) + val <<= 1; + + /* Length x in an instructions has real length x + 1. */ + if (operand->flags & OPERAND_LENGTH) + val++; + return val; +} + +struct s390_insn *find_insn(unsigned char *code) +{ + struct s390_opcode_offset *entry; + struct s390_insn *insn; + unsigned char opfrag; + int i; + + /* Search the opcode offset table to find an entry which + * matches the beginning of the opcode. If there is no match + * the last entry will be used, which is the default entry for + * unknown instructions as well as 1-byte opcode instructions. + */ + for (i = 0; i < ARRAY_SIZE(opcode_offset); i++) { + entry = &opcode_offset[i]; + if (entry->opcode == code[0]) + break; + } + + opfrag = *(code + entry->byte) & entry->mask; + + insn = &opcode[entry->offset]; + for (i = 0; i < entry->count; i++) { + if (insn->opfrag == opfrag) + return insn; + insn++; + } + return NULL; +} + +static int print_insn(char *buffer, unsigned char *code, unsigned long addr) +{ + struct s390_insn *insn; + const unsigned char *ops; + const struct s390_operand *operand; + unsigned int value; + char separator; + char *ptr; + int i; + + ptr = buffer; + insn = find_insn(code); + if (insn) { + if (insn->zero == 0) + ptr += sprintf(ptr, "%.7s\t", + long_insn_name[insn->offset]); + else + ptr += sprintf(ptr, "%.5s\t", insn->name); + /* Extract the operands. */ + separator = 0; + for (ops = formats[insn->format], i = 0; + *ops != 0 && i < 6; ops++, i++) { + operand = operands + *ops; + value = extract_operand(code, operand); + if ((operand->flags & OPERAND_INDEX) && value == 0) + continue; + if ((operand->flags & OPERAND_BASE) && + value == 0 && separator == '(') { + separator = ','; + continue; + } + if (separator) + ptr += sprintf(ptr, "%c", separator); + if (operand->flags & OPERAND_GPR) + ptr += sprintf(ptr, "%%r%i", value); + else if (operand->flags & OPERAND_FPR) + ptr += sprintf(ptr, "%%f%i", value); + else if (operand->flags & OPERAND_AR) + ptr += sprintf(ptr, "%%a%i", value); + else if (operand->flags & OPERAND_CR) + ptr += sprintf(ptr, "%%c%i", value); + else if (operand->flags & OPERAND_VR) + ptr += sprintf(ptr, "%%v%i", value); + else if (operand->flags & OPERAND_PCREL) { + void *pcrel = (void *)((int)value + addr); + + ptr += sprintf(ptr, "%px", pcrel); + } else if (operand->flags & OPERAND_SIGNED) + ptr += sprintf(ptr, "%i", value); + else + ptr += sprintf(ptr, "%u", value); + if (operand->flags & OPERAND_DISP) + separator = '('; + else if (operand->flags & OPERAND_BASE) { + ptr += sprintf(ptr, ")"); + separator = ','; + } else + separator = ','; + } + } else + ptr += sprintf(ptr, "unknown"); + return (int) (ptr - buffer); +} + +void show_code(struct pt_regs *regs) +{ + char *mode = user_mode(regs) ? "User" : "Krnl"; + unsigned char code[64]; + char buffer[128], *ptr; + mm_segment_t old_fs; + unsigned long addr; + int start, end, opsize, hops, i; + + /* Get a snapshot of the 64 bytes surrounding the fault address. */ + old_fs = get_fs(); + set_fs(user_mode(regs) ? USER_DS : KERNEL_DS); + for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { + addr = regs->psw.addr - 34 + start; + if (__copy_from_user(code + start - 2, + (char __user *) addr, 2)) + break; + } + for (end = 32; end < 64; end += 2) { + addr = regs->psw.addr + end - 32; + if (__copy_from_user(code + end, + (char __user *) addr, 2)) + break; + } + set_fs(old_fs); + /* Code snapshot useable ? */ + if ((regs->psw.addr & 1) || start >= end) { + printk("%s Code: Bad PSW.\n", mode); + return; + } + /* Find a starting point for the disassembly. */ + while (start < 32) { + for (i = 0, hops = 0; start + i < 32 && hops < 3; hops++) { + if (!find_insn(code + start + i)) + break; + i += insn_length(code[start + i]); + } + if (start + i == 32) + /* Looks good, sequence ends at PSW. */ + break; + start += 2; + } + /* Decode the instructions. */ + ptr = buffer; + ptr += sprintf(ptr, "%s Code:", mode); + hops = 0; + while (start < end && hops < 8) { + opsize = insn_length(code[start]); + if (start + opsize == 32) + *ptr++ = '#'; + else if (start == 32) + *ptr++ = '>'; + else + *ptr++ = ' '; + addr = regs->psw.addr + start - 32; + ptr += sprintf(ptr, "%px: ", (void *)addr); + if (start + opsize >= end) + break; + for (i = 0; i < opsize; i++) + ptr += sprintf(ptr, "%02x", code[start + i]); + *ptr++ = '\t'; + if (i < 6) + *ptr++ = '\t'; + ptr += print_insn(ptr, code + start, addr); + start += opsize; + pr_cont("%s", buffer); + ptr = buffer; + ptr += sprintf(ptr, "\n "); + hops++; + } + pr_cont("\n"); +} + +void print_fn_code(unsigned char *code, unsigned long len) +{ + char buffer[128], *ptr; + int opsize, i; + + while (len) { + ptr = buffer; + opsize = insn_length(*code); + if (opsize > len) + break; + ptr += sprintf(ptr, "%px: ", code); + for (i = 0; i < opsize; i++) + ptr += sprintf(ptr, "%02x", code[i]); + *ptr++ = '\t'; + if (i < 4) + *ptr++ = '\t'; + ptr += print_insn(ptr, code, (unsigned long) code); + *ptr++ = '\n'; + *ptr++ = 0; + printk("%s", buffer); + code += opsize; + len -= opsize; + } +} diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c new file mode 100644 index 000000000..5b23c4f6e --- /dev/null +++ b/arch/s390/kernel/dumpstack.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stack dumping functions + * + * Copyright IBM Corp. 1999, 2013 + */ + +#include <linux/kallsyms.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/utsname.h> +#include <linux/export.h> +#include <linux/kdebug.h> +#include <linux/ptrace.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> +#include <asm/processor.h> +#include <asm/debug.h> +#include <asm/dis.h> +#include <asm/ipl.h> + +/* + * For dump_trace we have tree different stack to consider: + * - the panic stack which is used if the kernel stack has overflown + * - the asynchronous interrupt stack (cpu related) + * - the synchronous kernel stack (process related) + * The stack trace can start at any of the three stacks and can potentially + * touch all of them. The order is: panic stack, async stack, sync stack. + */ +static unsigned long +__dump_trace(dump_trace_func_t func, void *data, unsigned long sp, + unsigned long low, unsigned long high) +{ + struct stack_frame *sf; + struct pt_regs *regs; + + while (1) { + if (sp < low || sp > high - sizeof(*sf)) + return sp; + sf = (struct stack_frame *) sp; + if (func(data, sf->gprs[8], 0)) + return sp; + /* Follow the backchain. */ + while (1) { + low = sp; + sp = sf->back_chain; + if (!sp) + break; + if (sp <= low || sp > high - sizeof(*sf)) + return sp; + sf = (struct stack_frame *) sp; + if (func(data, sf->gprs[8], 1)) + return sp; + } + /* Zero backchain detected, check for interrupt frame. */ + sp = (unsigned long) (sf + 1); + if (sp <= low || sp > high - sizeof(*regs)) + return sp; + regs = (struct pt_regs *) sp; + if (!user_mode(regs)) { + if (func(data, regs->psw.addr, 1)) + return sp; + } + low = sp; + sp = regs->gprs[15]; + } +} + +void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, + unsigned long sp) +{ + unsigned long frame_size; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); +#ifdef CONFIG_CHECK_STACK + sp = __dump_trace(func, data, sp, + S390_lowcore.panic_stack + frame_size - PAGE_SIZE, + S390_lowcore.panic_stack + frame_size); +#endif + sp = __dump_trace(func, data, sp, + S390_lowcore.async_stack + frame_size - ASYNC_SIZE, + S390_lowcore.async_stack + frame_size); + task = task ?: current; + __dump_trace(func, data, sp, + (unsigned long)task_stack_page(task), + (unsigned long)task_stack_page(task) + THREAD_SIZE); +} +EXPORT_SYMBOL_GPL(dump_trace); + +static int show_address(void *data, unsigned long address, int reliable) +{ + if (reliable) + printk(" [<%016lx>] %pSR \n", address, (void *)address); + else + printk("([<%016lx>] %pSR)\n", address, (void *)address); + return 0; +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + unsigned long sp = (unsigned long) stack; + + if (!sp) + sp = task ? task->thread.ksp : current_stack_pointer(); + printk("Call Trace:\n"); + dump_trace(show_address, NULL, task, sp); + if (!task) + task = current; + debug_show_held_locks(task); +} + +static void show_last_breaking_event(struct pt_regs *regs) +{ + printk("Last Breaking-Event-Address:\n"); + printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); +} + +void show_registers(struct pt_regs *regs) +{ + struct psw_bits *psw = &psw_bits(regs->psw); + char *mode; + + mode = user_mode(regs) ? "User" : "Krnl"; + printk("%s PSW : %p %p", mode, (void *)regs->psw.mask, (void *)regs->psw.addr); + if (!user_mode(regs)) + pr_cont(" (%pSR)", (void *)regs->psw.addr); + pr_cont("\n"); + printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " + "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, + psw->key, psw->mcheck, psw->wait, psw->pstate, psw->as, psw->cc, psw->pm); + pr_cont(" RI:%x EA:%x\n", psw->ri, psw->eaba); + printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, + regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[4], regs->gprs[5], regs->gprs[6], regs->gprs[7]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[8], regs->gprs[9], regs->gprs[10], regs->gprs[11]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[12], regs->gprs[13], regs->gprs[14], regs->gprs[15]); + show_code(regs); +} + +void show_regs(struct pt_regs *regs) +{ + show_regs_print_info(KERN_DEFAULT); + show_registers(regs); + /* Show stack backtrace if pt_regs is from kernel mode */ + if (!user_mode(regs)) + show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_last_breaking_event(regs); +} + +static DEFINE_SPINLOCK(die_lock); + +void die(struct pt_regs *regs, const char *str) +{ + static int die_counter; + + oops_enter(); + lgr_info_log(); + debug_stop_all(); + console_verbose(); + spin_lock_irq(&die_lock); + bust_spinlocks(1); + printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + regs->int_code >> 17, ++die_counter); +#ifdef CONFIG_PREEMPT + pr_cont("PREEMPT "); +#endif +#ifdef CONFIG_SMP + pr_cont("SMP "); +#endif + if (debug_pagealloc_enabled()) + pr_cont("DEBUG_PAGEALLOC"); + pr_cont("\n"); + notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV); + print_modules(); + show_regs(regs); + bust_spinlocks(0); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + spin_unlock_irq(&die_lock); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception: panic_on_oops"); + oops_exit(); + do_exit(SIGSEGV); +} diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c new file mode 100644 index 000000000..ad88bed74 --- /dev/null +++ b/arch/s390/kernel/early.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2009 + * Author(s): Hongjie Yang <hongjie@us.ibm.com>, + * Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#define KMSG_COMPONENT "setup" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/lockdep.h> +#include <linux/extable.h> +#include <linux/pfn.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/ipl.h> +#include <asm/lowcore.h> +#include <asm/processor.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sysinfo.h> +#include <asm/cpcmd.h> +#include <asm/sclp.h> +#include <asm/facility.h> +#include "entry.h" + +static void __init setup_boot_command_line(void); + +/* + * Initialize storage key for kernel pages + */ +static noinline __init void init_kernel_storage_key(void) +{ +#if PAGE_DEFAULT_KEY + unsigned long end_pfn, init_pfn; + + end_pfn = PFN_UP(__pa(_end)); + + for (init_pfn = 0 ; init_pfn < end_pfn; init_pfn++) + page_set_storage_key(init_pfn << PAGE_SHIFT, + PAGE_DEFAULT_KEY, 0); +#endif +} + +static __initdata char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static noinline __init void detect_machine_type(void) +{ + struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; + + /* Check current-configuration-level */ + if (stsi(NULL, 0, 0, 0) <= 2) { + S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; + return; + } + /* Get virtual-machine cpu information. */ + if (stsi(vmms, 3, 2, 2) || !vmms->count) + return; + + /* Detect known hypervisors */ + if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) + S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) + S390_lowcore.machine_flags |= MACHINE_FLAG_VM; +} + +/* Remove leading, trailing and double whitespace. */ +static inline void strim_all(char *str) +{ + char *s; + + s = strim(str); + if (s != str) + memmove(str, s, strlen(s)); + while (*str) { + if (!isspace(*str++)) + continue; + if (isspace(*str)) { + s = skip_spaces(str); + memmove(str, s, strlen(s) + 1); + } + } +} + +static noinline __init void setup_arch_string(void) +{ + struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; + struct sysinfo_3_2_2 *vm = (struct sysinfo_3_2_2 *)&sysinfo_page; + char mstr[80], hvstr[17]; + + if (stsi(mach, 1, 1, 1)) + return; + EBCASC(mach->manufacturer, sizeof(mach->manufacturer)); + EBCASC(mach->type, sizeof(mach->type)); + EBCASC(mach->model, sizeof(mach->model)); + EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); + sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); + strim_all(mstr); + if (stsi(vm, 3, 2, 2) == 0 && vm->count) { + EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); + sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + strim_all(hvstr); + } else { + sprintf(hvstr, "%s", + MACHINE_IS_LPAR ? "LPAR" : + MACHINE_IS_VM ? "z/VM" : + MACHINE_IS_KVM ? "KVM" : "unknown"); + } + dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); +} + +static __init void setup_topology(void) +{ + int max_mnest; + + if (!test_facility(11)) + return; + S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; + for (max_mnest = 6; max_mnest > 1; max_mnest--) { + if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) + break; + } + topology_max_mnest = max_mnest; +} + +static void early_pgm_check_handler(void) +{ + const struct exception_table_entry *fixup; + unsigned long cr0, cr0_new; + unsigned long addr; + + addr = S390_lowcore.program_old_psw.addr; + fixup = search_exception_tables(addr); + if (!fixup) + disabled_wait(0); + /* Disable low address protection before storing into lowcore. */ + __ctl_store(cr0, 0, 0); + cr0_new = cr0 & ~(1UL << 28); + __ctl_load(cr0_new, 0, 0); + S390_lowcore.program_old_psw.addr = extable_fixup(fixup); + __ctl_load(cr0, 0, 0); +} + +static noinline __init void setup_lowcore_early(void) +{ + psw_t psw; + + psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; + if (IS_ENABLED(CONFIG_KASAN)) + psw.mask |= PSW_MASK_DAT; + psw.addr = (unsigned long) s390_base_ext_handler; + S390_lowcore.external_new_psw = psw; + psw.addr = (unsigned long) s390_base_pgm_handler; + S390_lowcore.program_new_psw = psw; + s390_base_pgm_handler_fn = early_pgm_check_handler; + S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; +} + +static noinline __init void setup_facility_list(void) +{ + stfle(S390_lowcore.stfle_fac_list, + ARRAY_SIZE(S390_lowcore.stfle_fac_list)); + memcpy(S390_lowcore.alt_stfle_fac_list, + S390_lowcore.stfle_fac_list, + sizeof(S390_lowcore.alt_stfle_fac_list)); + if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); +} + +static __init void detect_diag9c(void) +{ + unsigned int cpu_address; + int rc; + + cpu_address = stap(); + diag_stat_inc(DIAG_STAT_X09C); + asm volatile( + " diag %2,0,0x9c\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); + if (!rc) + S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; +} + +static __init void detect_diag44(void) +{ + int rc; + + diag_stat_inc(DIAG_STAT_X044); + asm volatile( + " diag 0,0,0x44\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc"); + if (!rc) + S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44; +} + +static __init void detect_machine_facilities(void) +{ + if (test_facility(8)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; + __ctl_set_bit(0, 23); + } + if (test_facility(78)) + S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; + if (test_facility(3)) + S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; + if (test_facility(50) && test_facility(73)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + __ctl_set_bit(0, 55); + } + if (test_facility(51)) + S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; + if (test_facility(129)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_VX; + __ctl_set_bit(0, 17); + } + if (test_facility(130)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_NX; + __ctl_set_bit(0, 20); + } + if (test_facility(133)) + S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } +} + +static inline void save_vector_registers(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (test_facility(129)) + save_vx_regs(boot_cpu_vector_save_area); +#endif +} + +static int __init disable_vector_extension(char *str) +{ + S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; + __ctl_clear_bit(0, 17); + return 0; +} +early_param("novx", disable_vector_extension); + +static int __init noexec_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (!rc && !enabled) { + /* Disable no-execute support */ + S390_lowcore.machine_flags &= ~MACHINE_FLAG_NX; + __ctl_clear_bit(0, 20); + } + return rc; +} +early_param("noexec", noexec_setup); + +static int __init cad_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (!rc && enabled && test_facility(128)) + /* Enable problem state CAD. */ + __ctl_set_bit(2, 3); + return rc; +} +early_param("cad", cad_setup); + +/* Set up boot command line */ +static void __init append_to_cmdline(size_t (*ipl_data)(char *, size_t)) +{ + char *parm, *delim; + size_t rc, len; + + len = strlen(boot_command_line); + + delim = boot_command_line + len; /* '\0' character position */ + parm = boot_command_line + len + 1; /* append right after '\0' */ + + rc = ipl_data(parm, COMMAND_LINE_SIZE - len - 1); + if (rc) { + if (*parm == '=') + memmove(boot_command_line, parm + 1, rc); + else + *delim = ' '; /* replace '\0' with space */ + } +} + +static inline int has_ebcdic_char(const char *str) +{ + int i; + + for (i = 0; str[i]; i++) + if (str[i] & 0x80) + return 1; + return 0; +} + +static void __init setup_boot_command_line(void) +{ + COMMAND_LINE[ARCH_COMMAND_LINE_SIZE - 1] = 0; + /* convert arch command line to ascii if necessary */ + if (has_ebcdic_char(COMMAND_LINE)) + EBCASC(COMMAND_LINE, ARCH_COMMAND_LINE_SIZE); + /* copy arch command line */ + strlcpy(boot_command_line, strstrip(COMMAND_LINE), + ARCH_COMMAND_LINE_SIZE); + + /* append IPL PARM data to the boot command line */ + if (MACHINE_IS_VM) + append_to_cmdline(append_ipl_vmparm); + + append_to_cmdline(append_ipl_scpdata); +} + +static void __init check_image_bootable(void) +{ + if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING))) + return; + + sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); + sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); + sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); + disabled_wait(0xbadb007); +} + +void __init startup_init(void) +{ + check_image_bootable(); + time_early_init(); + init_kernel_storage_key(); + lockdep_off(); + setup_lowcore_early(); + setup_facility_list(); + detect_machine_type(); + setup_arch_string(); + ipl_store_parameters(); + setup_boot_command_line(); + detect_diag9c(); + detect_diag44(); + detect_machine_facilities(); + save_vector_registers(); + setup_topology(); + sclp_early_detect(); + lockdep_on(); +} diff --git a/arch/s390/kernel/early_nobss.c b/arch/s390/kernel/early_nobss.c new file mode 100644 index 000000000..2d84fc48d --- /dev/null +++ b/arch/s390/kernel/early_nobss.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2018 + */ + +/* + * Early setup functions which may not rely on an initialized bss + * section. The last thing that is supposed to happen here is + * initialization of the bss section. + */ + +#include <linux/processor.h> +#include <linux/string.h> +#include <asm/sections.h> +#include <asm/lowcore.h> +#include <asm/setup.h> +#include <asm/timex.h> +#include "entry.h" + +static void __init reset_tod_clock(void) +{ + u64 time; + + if (store_tod_clock(&time) == 0) + return; + /* TOD clock not running. Set the clock to Unix Epoch. */ + if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) + disabled_wait(0); + + memset(tod_clock_base, 0, 16); + *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; +} + +static void __init rescue_initrd(void) +{ + unsigned long min_initrd_addr = (unsigned long) _end + (4UL << 20); + + /* + * Just like in case of IPL from VM reader we make sure there is a + * gap of 4MB between end of kernel and start of initrd. + * That way we can also be sure that saving an NSS will succeed, + * which however only requires different segments. + */ + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) + return; + if (!INITRD_START || !INITRD_SIZE) + return; + if (INITRD_START >= min_initrd_addr) + return; + memmove((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); + INITRD_START = min_initrd_addr; +} + +static void __init clear_bss_section(void) +{ + memset(__bss_start, 0, __bss_stop - __bss_start); +} + +void __init startup_init_nobss(void) +{ + reset_tod_clock(); + rescue_initrd(); + clear_bss_section(); +} diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c new file mode 100644 index 000000000..40c1dfec9 --- /dev/null +++ b/arch/s390/kernel/early_printk.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2017 + */ + +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/sclp.h> + +static void sclp_early_write(struct console *con, const char *s, unsigned int len) +{ + __sclp_early_printk(s, len, 0); +} + +static struct console sclp_early_console = { + .name = "earlysclp", + .write = sclp_early_write, + .flags = CON_PRINTBUFFER | CON_BOOT, + .index = -1, +}; + +static int __init setup_early_printk(char *buf) +{ + if (early_console) + return 0; + /* Accept only "earlyprintk" and "earlyprintk=sclp" */ + if (buf && strncmp(buf, "sclp", 4)) + return 0; + if (!sclp.has_linemode && !sclp.has_vt220) + return 0; + early_console = &sclp_early_console; + register_console(early_console); + return 0; +} +early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c new file mode 100644 index 000000000..7f8246c9b --- /dev/null +++ b/arch/s390/kernel/ebcdic.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ECBDIC -> ASCII, ASCII -> ECBDIC, + * upper to lower case (EBCDIC) conversion tables. + * + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * Martin Peschke <peschke@fh-brandenburg.de> + */ + +#include <linux/types.h> +#include <linux/export.h> +#include <asm/ebcdic.h> + +/* + * ASCII (IBM PC 437) -> EBCDIC 037 + */ +__u8 _ascebc[256] = +{ + /*00 NUL SOH STX ETX EOT ENQ ACK BEL */ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, + /*08 BS HT LF VT FF CR SO SI */ + /* ->NL */ + 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /*10 DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, + /*18 CAN EM SUB ESC FS GS RS US */ + /* ->IGS ->IRS ->IUS */ + 0x18, 0x19, 0x3F, 0x27, 0x22, 0x1D, 0x1E, 0x1F, + /*20 SP ! " # $ % & ' */ + 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, + /*28 ( ) * + , - . / */ + 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, + /*30 0 1 2 3 4 5 6 7 */ + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + /*38 8 9 : ; < = > ? */ + 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, + /*40 @ A B C D E F G */ + 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + /*48 H I J K L M N O */ + 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, + /*50 P Q R S T U V W */ + 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, + /*58 X Y Z [ \ ] ^ _ */ + 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D, + /*60 ` a b c d e f g */ + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + /*68 h i j k l m n o */ + 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + /*70 p q r s t u v w */ + 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, + /*78 x y z { | } ~ DL */ + 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, + /*80*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*88*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*90*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*98*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E0 sz */ + 0x3F, 0x59, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F8*/ + 0x90, 0x3F, 0x3F, 0x3F, 0x3F, 0xEA, 0x3F, 0xFF +}; + +/* + * EBCDIC 037 -> ASCII (IBM PC 437) + */ +__u8 _ebcasc[256] = +{ + /* 0x00 NUL SOH STX ETX *SEL HT *RNL DEL */ + 0x00, 0x01, 0x02, 0x03, 0x07, 0x09, 0x07, 0x7F, + /* 0x08 -GE -SPS -RPT VT FF CR SO SI */ + 0x07, 0x07, 0x07, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /* 0x10 DLE DC1 DC2 DC3 -RES -NL BS -POC + -ENP ->LF */ + 0x10, 0x11, 0x12, 0x13, 0x07, 0x0A, 0x08, 0x07, + /* 0x18 CAN EM -UBS -CU1 -IFS -IGS -IRS -ITB + -IUS */ + 0x18, 0x19, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x20 -DS -SOS FS -WUS -BYP LF ETB ESC + -INP */ + 0x07, 0x07, 0x1C, 0x07, 0x07, 0x0A, 0x17, 0x1B, + /* 0x28 -SA -SFE -SM -CSP -MFA ENQ ACK BEL + -SW */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x05, 0x06, 0x07, + /* 0x30 ---- ---- SYN -IR -PP -TRN -NBS EOT */ + 0x07, 0x07, 0x16, 0x07, 0x07, 0x07, 0x07, 0x04, + /* 0x38 -SBS -IT -RFF -CU3 DC4 NAK ---- SUB */ + 0x07, 0x07, 0x07, 0x07, 0x14, 0x15, 0x07, 0x1A, + /* 0x40 SP RSP ä ---- */ + 0x20, 0xFF, 0x83, 0x84, 0x85, 0xA0, 0x07, 0x86, + /* 0x48 . < ( + | */ + 0x87, 0xA4, 0x9B, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, + /* 0x50 & ---- */ + 0x26, 0x82, 0x88, 0x89, 0x8A, 0xA1, 0x8C, 0x07, + /* 0x58 ß ! $ * ) ; */ + 0x8D, 0xE1, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0xAA, + /* 0x60 - / ---- Ä ---- ---- ---- */ + 0x2D, 0x2F, 0x07, 0x8E, 0x07, 0x07, 0x07, 0x8F, + /* 0x68 ---- , % _ > ? */ + 0x80, 0xA5, 0x07, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + /* 0x70 ---- ---- ---- ---- ---- ---- ---- */ + 0x07, 0x90, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x78 * ` : # @ ' = " */ + 0x70, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + /* 0x80 * a b c d e f g */ + 0x07, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + /* 0x88 h i ---- ---- ---- */ + 0x68, 0x69, 0xAE, 0xAF, 0x07, 0x07, 0x07, 0xF1, + /* 0x90 ° j k l m n o p */ + 0xF8, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + /* 0x98 q r ---- ---- */ + 0x71, 0x72, 0xA6, 0xA7, 0x91, 0x07, 0x92, 0x07, + /* 0xA0 ~ s t u v w x */ + 0xE6, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + /* 0xA8 y z ---- ---- ---- ---- */ + 0x79, 0x7A, 0xAD, 0xAB, 0x07, 0x07, 0x07, 0x07, + /* 0xB0 ^ ---- § ---- */ + 0x5E, 0x9C, 0x9D, 0xFA, 0x07, 0x07, 0x07, 0xAC, + /* 0xB8 ---- [ ] ---- ---- ---- ---- */ + 0xAB, 0x07, 0x5B, 0x5D, 0x07, 0x07, 0x07, 0x07, + /* 0xC0 { A B C D E F G */ + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + /* 0xC8 H I ---- ö ---- */ + 0x48, 0x49, 0x07, 0x93, 0x94, 0x95, 0xA2, 0x07, + /* 0xD0 } J K L M N O P */ + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, + /* 0xD8 Q R ---- ü */ + 0x51, 0x52, 0x07, 0x96, 0x81, 0x97, 0xA3, 0x98, + /* 0xE0 \ S T U V W X */ + 0x5C, 0xF6, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + /* 0xE8 Y Z ---- Ö ---- ---- ---- */ + 0x59, 0x5A, 0xFD, 0x07, 0x99, 0x07, 0x07, 0x07, + /* 0xF0 0 1 2 3 4 5 6 7 */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + /* 0xF8 8 9 ---- ---- Ãœ ---- ---- ---- */ + 0x38, 0x39, 0x07, 0x07, 0x9A, 0x07, 0x07, 0x07 +}; + + +/* + * ASCII (IBM PC 437) -> EBCDIC 500 + */ +__u8 _ascebc_500[256] = +{ + /*00 NUL SOH STX ETX EOT ENQ ACK BEL */ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, + /*08 BS HT LF VT FF CR SO SI */ + /* ->NL */ + 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /*10 DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, + /*18 CAN EM SUB ESC FS GS RS US */ + /* ->IGS ->IRS ->IUS */ + 0x18, 0x19, 0x3F, 0x27, 0x22, 0x1D, 0x1E, 0x1F, + /*20 SP ! " # $ % & ' */ + 0x40, 0x4F, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, + /*28 ( ) * + , - . / */ + 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, + /*30 0 1 2 3 4 5 6 7 */ + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + /*38 8 9 : ; < = > ? */ + 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, + /*40 @ A B C D E F G */ + 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + /*48 H I J K L M N O */ + 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, + /*50 P Q R S T U V W */ + 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, + /*58 X Y Z [ \ ] ^ _ */ + 0xE7, 0xE8, 0xE9, 0x4A, 0xE0, 0x5A, 0x5F, 0x6D, + /*60 ` a b c d e f g */ + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + /*68 h i j k l m n o */ + 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + /*70 p q r s t u v w */ + 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, + /*78 x y z { | } ~ DL */ + 0xA7, 0xA8, 0xA9, 0xC0, 0xBB, 0xD0, 0xA1, 0x07, + /*80*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*88*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*90*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*98*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E0 sz */ + 0x3F, 0x59, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F8*/ + 0x90, 0x3F, 0x3F, 0x3F, 0x3F, 0xEA, 0x3F, 0xFF +}; + +/* + * EBCDIC 500 -> ASCII (IBM PC 437) + */ +__u8 _ebcasc_500[256] = +{ + /* 0x00 NUL SOH STX ETX *SEL HT *RNL DEL */ + 0x00, 0x01, 0x02, 0x03, 0x07, 0x09, 0x07, 0x7F, + /* 0x08 -GE -SPS -RPT VT FF CR SO SI */ + 0x07, 0x07, 0x07, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /* 0x10 DLE DC1 DC2 DC3 -RES -NL BS -POC + -ENP ->LF */ + 0x10, 0x11, 0x12, 0x13, 0x07, 0x0A, 0x08, 0x07, + /* 0x18 CAN EM -UBS -CU1 -IFS -IGS -IRS -ITB + -IUS */ + 0x18, 0x19, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x20 -DS -SOS FS -WUS -BYP LF ETB ESC + -INP */ + 0x07, 0x07, 0x1C, 0x07, 0x07, 0x0A, 0x17, 0x1B, + /* 0x28 -SA -SFE -SM -CSP -MFA ENQ ACK BEL + -SW */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x05, 0x06, 0x07, + /* 0x30 ---- ---- SYN -IR -PP -TRN -NBS EOT */ + 0x07, 0x07, 0x16, 0x07, 0x07, 0x07, 0x07, 0x04, + /* 0x38 -SBS -IT -RFF -CU3 DC4 NAK ---- SUB */ + 0x07, 0x07, 0x07, 0x07, 0x14, 0x15, 0x07, 0x1A, + /* 0x40 SP RSP ä ---- */ + 0x20, 0xFF, 0x83, 0x84, 0x85, 0xA0, 0x07, 0x86, + /* 0x48 [ . < ( + ! */ + 0x87, 0xA4, 0x5B, 0x2E, 0x3C, 0x28, 0x2B, 0x21, + /* 0x50 & ---- */ + 0x26, 0x82, 0x88, 0x89, 0x8A, 0xA1, 0x8C, 0x07, + /* 0x58 ß ] $ * ) ; ^ */ + 0x8D, 0xE1, 0x5D, 0x24, 0x2A, 0x29, 0x3B, 0x5E, + /* 0x60 - / ---- Ä ---- ---- ---- */ + 0x2D, 0x2F, 0x07, 0x8E, 0x07, 0x07, 0x07, 0x8F, + /* 0x68 ---- , % _ > ? */ + 0x80, 0xA5, 0x07, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + /* 0x70 ---- ---- ---- ---- ---- ---- ---- */ + 0x07, 0x90, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x78 * ` : # @ ' = " */ + 0x70, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + /* 0x80 * a b c d e f g */ + 0x07, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + /* 0x88 h i ---- ---- ---- */ + 0x68, 0x69, 0xAE, 0xAF, 0x07, 0x07, 0x07, 0xF1, + /* 0x90 ° j k l m n o p */ + 0xF8, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + /* 0x98 q r ---- ---- */ + 0x71, 0x72, 0xA6, 0xA7, 0x91, 0x07, 0x92, 0x07, + /* 0xA0 ~ s t u v w x */ + 0xE6, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + /* 0xA8 y z ---- ---- ---- ---- */ + 0x79, 0x7A, 0xAD, 0xAB, 0x07, 0x07, 0x07, 0x07, + /* 0xB0 ---- § ---- */ + 0x9B, 0x9C, 0x9D, 0xFA, 0x07, 0x07, 0x07, 0xAC, + /* 0xB8 ---- | ---- ---- ---- ---- */ + 0xAB, 0x07, 0xAA, 0x7C, 0x07, 0x07, 0x07, 0x07, + /* 0xC0 { A B C D E F G */ + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + /* 0xC8 H I ---- ö ---- */ + 0x48, 0x49, 0x07, 0x93, 0x94, 0x95, 0xA2, 0x07, + /* 0xD0 } J K L M N O P */ + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, + /* 0xD8 Q R ---- ü */ + 0x51, 0x52, 0x07, 0x96, 0x81, 0x97, 0xA3, 0x98, + /* 0xE0 \ S T U V W X */ + 0x5C, 0xF6, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + /* 0xE8 Y Z ---- Ö ---- ---- ---- */ + 0x59, 0x5A, 0xFD, 0x07, 0x99, 0x07, 0x07, 0x07, + /* 0xF0 0 1 2 3 4 5 6 7 */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + /* 0xF8 8 9 ---- ---- Ãœ ---- ---- ---- */ + 0x38, 0x39, 0x07, 0x07, 0x9A, 0x07, 0x07, 0x07 +}; + + +/* + * EBCDIC 037/500 conversion table: + * from upper to lower case + */ +__u8 _ebc_tolower[256] = +{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9C, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xAA, 0xAB, 0x8C, 0x8D, 0x8E, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, + 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xEA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xDB, 0xDC, 0xDD, 0xDE, 0xFF +}; + + +/* + * EBCDIC 037/500 conversion table: + * from lower to upper case + */ +__u8 _ebc_toupper[256] = +{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0x8A, 0x8B, 0xAC, 0xAD, 0xAE, 0x8F, + 0x90, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0x9A, 0x9B, 0x9E, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, + 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF +}; + +EXPORT_SYMBOL(_ascebc_500); +EXPORT_SYMBOL(_ebcasc_500); +EXPORT_SYMBOL(_ascebc); +EXPORT_SYMBOL(_ebcasc); +EXPORT_SYMBOL(_ebc_tolower); +EXPORT_SYMBOL(_ebc_toupper); + diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S new file mode 100644 index 000000000..7e6a9cf86 --- /dev/null +++ b/arch/s390/kernel/entry.S @@ -0,0 +1,1510 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 low-level entry points. + * + * Copyright IBM Corp. 1999, 2012 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Hartmut Penner (hp@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/ctl_reg.h> +#include <asm/dwarf.h> +#include <asm/errno.h> +#include <asm/ptrace.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/page.h> +#include <asm/sigp.h> +#include <asm/irq.h> +#include <asm/vx-insn.h> +#include <asm/setup.h> +#include <asm/nmi.h> +#include <asm/export.h> +#include <asm/nospec-insn.h> + +__PT_R0 = __PT_GPRS +__PT_R1 = __PT_GPRS + 8 +__PT_R2 = __PT_GPRS + 16 +__PT_R3 = __PT_GPRS + 24 +__PT_R4 = __PT_GPRS + 32 +__PT_R5 = __PT_GPRS + 40 +__PT_R6 = __PT_GPRS + 48 +__PT_R7 = __PT_GPRS + 56 +__PT_R8 = __PT_GPRS + 64 +__PT_R9 = __PT_GPRS + 72 +__PT_R10 = __PT_GPRS + 80 +__PT_R11 = __PT_GPRS + 88 +__PT_R12 = __PT_GPRS + 96 +__PT_R13 = __PT_GPRS + 104 +__PT_R14 = __PT_GPRS + 112 +__PT_R15 = __PT_GPRS + 120 + +STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER +STACK_SIZE = 1 << STACK_SHIFT +STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE + +_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ + _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING) +_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + _TIF_SYSCALL_TRACEPOINT) +_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ + _CIF_ASCE_SECONDARY | _CIF_FPU) +_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) + +_LPP_OFFSET = __LC_LPP + +#define BASED(name) name-cleanup_critical(%r13) + + .macro TRACE_IRQS_ON +#ifdef CONFIG_TRACE_IRQFLAGS + basr %r2,%r0 + brasl %r14,trace_hardirqs_on_caller +#endif + .endm + + .macro TRACE_IRQS_OFF +#ifdef CONFIG_TRACE_IRQFLAGS + basr %r2,%r0 + brasl %r14,trace_hardirqs_off_caller +#endif + .endm + + .macro LOCKDEP_SYS_EXIT +#ifdef CONFIG_LOCKDEP + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jz .+10 + brasl %r14,lockdep_sys_exit +#endif + .endm + + .macro CHECK_STACK stacksize,savearea +#ifdef CONFIG_CHECK_STACK + tml %r15,\stacksize - CONFIG_STACK_GUARD + lghi %r14,\savearea + jz stack_overflow +#endif + .endm + + .macro SWITCH_ASYNC savearea,timer + tmhh %r8,0x0001 # interrupting from user ? + jnz 1f + lgr %r14,%r9 + slg %r14,BASED(.Lcritical_start) + clg %r14,BASED(.Lcritical_length) + jhe 0f + lghi %r11,\savearea # inside critical section, do cleanup + brasl %r14,cleanup_critical + tmhh %r8,0x0001 # retest problem state after cleanup + jnz 1f +0: lg %r14,__LC_ASYNC_STACK # are we already on the async stack? + slgr %r14,%r15 + srag %r14,%r14,STACK_SHIFT + jnz 2f + CHECK_STACK 1<<STACK_SHIFT,\savearea + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + j 3f +1: UPDATE_VTIME %r14,%r15,\timer + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP +2: lg %r15,__LC_ASYNC_STACK # load async stack +3: la %r11,STACK_FRAME_OVERHEAD(%r15) + .endm + + .macro UPDATE_VTIME w1,w2,enter_timer + lg \w1,__LC_EXIT_TIMER + lg \w2,__LC_LAST_UPDATE_TIMER + slg \w1,\enter_timer + slg \w2,__LC_EXIT_TIMER + alg \w1,__LC_USER_TIMER + alg \w2,__LC_SYSTEM_TIMER + stg \w1,__LC_USER_TIMER + stg \w2,__LC_SYSTEM_TIMER + mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer + .endm + + .macro REENABLE_IRQS + stg %r8,__LC_RETURN_PSW + ni __LC_RETURN_PSW,0xbf + ssm __LC_RETURN_PSW + .endm + + .macro STCK savearea +#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES + .insn s,0xb27c0000,\savearea # store clock fast +#else + .insn s,0xb2050000,\savearea # store clock +#endif + .endm + + /* + * The TSTMSK macro generates a test-under-mask instruction by + * calculating the memory offset for the specified mask value. + * Mask value can be any constant. The macro shifts the mask + * value to calculate the memory offset for the test-under-mask + * instruction. + */ + .macro TSTMSK addr, mask, size=8, bytepos=0 + .if (\bytepos < \size) && (\mask >> 8) + .if (\mask & 0xff) + .error "Mask exceeds byte boundary" + .endif + TSTMSK \addr, "(\mask >> 8)", \size, "(\bytepos + 1)" + .exitm + .endif + .ifeq \mask + .error "Mask must not be zero" + .endif + off = \size - \bytepos - 1 + tm off+\addr, \mask + .endm + + .macro BPOFF + ALTERNATIVE "", ".long 0xb2e8c000", 82 + .endm + + .macro BPON + ALTERNATIVE "", ".long 0xb2e8d000", 82 + .endm + + .macro BPENTER tif_ptr,tif_mask + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \ + "", 82 + .endm + + .macro BPEXIT tif_ptr,tif_mask + TSTMSK \tif_ptr,\tif_mask + ALTERNATIVE "jz .+8; .long 0xb2e8c000", \ + "jnz .+8; .long 0xb2e8d000", 82 + .endm + + GEN_BR_THUNK %r9 + GEN_BR_THUNK %r14 + GEN_BR_THUNK %r14,%r11 + + .section .kprobes.text, "ax" +.Ldummy: + /* + * This nop exists only in order to avoid that __switch_to starts at + * the beginning of the kprobes text section. In that case we would + * have several symbols at the same address. E.g. objdump would take + * an arbitrary symbol name when disassembling this code. + * With the added nop in between the __switch_to symbol is unique + * again. + */ + nop 0 + +ENTRY(__bpon) + .globl __bpon + BPON + BR_EX %r14 + +/* + * Scheduler resume function, called by switch_to + * gpr2 = (task_struct *) prev + * gpr3 = (task_struct *) next + * Returns: + * gpr2 = prev + */ +ENTRY(__switch_to) + stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task + lghi %r4,__TASK_stack + lghi %r1,__TASK_thread + lg %r5,0(%r4,%r3) # start of kernel stack of next + stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev + lgr %r15,%r5 + aghi %r15,STACK_INIT # end of kernel stack of next + stg %r3,__LC_CURRENT # store task struct of next + stg %r15,__LC_KERNEL_STACK # store end of kernel stack + lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next + aghi %r3,__TASK_pid + mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task + ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + BR_EX %r14 + +.L__critical_start: + +#if IS_ENABLED(CONFIG_KVM) +/* + * sie64a calling convention: + * %r2 pointer to sie control block + * %r3 guest register save area + */ +ENTRY(sie64a) + stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers + lg %r12,__LC_CURRENT + stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer + stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags + TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ? + jno .Lsie_load_guest_gprs + brasl %r14,load_fpu_regs # load guest fp/vx regs +.Lsie_load_guest_gprs: + lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lg %r14,__LC_GMAP # get gmap pointer + ltgr %r14,%r14 + jz .Lsie_gmap + lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce +.Lsie_gmap: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer + oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now + tm __SIE_PROG20+3(%r14),3 # last exit... + jnz .Lsie_skip + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jo .Lsie_skip # exit if fp/vx regs changed + BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) +.Lsie_entry: + sie 0(%r14) +.Lsie_exit: + BPOFF + BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) +.Lsie_skip: + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce +.Lsie_done: +# some program checks are suppressing. C code (e.g. do_protection_exception) +# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There +# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. +# Other instructions between sie64a and .Lsie_done should not cause program +# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. +# See also .Lcleanup_sie +.Lrewind_pad6: + nopr 7 +.Lrewind_pad4: + nopr 7 +.Lrewind_pad2: + nopr 7 + .globl sie_exit +sie_exit: + lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area + stmg %r0,%r13,0(%r14) # save guest gprs 0-13 + xgr %r0,%r0 # clear guest registers to + xgr %r1,%r1 # prevent speculative use + xgr %r2,%r2 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers + lg %r2,__SF_SIE_REASON(%r15) # return exit reason code + BR_EX %r14 +.Lsie_fault: + lghi %r14,-EFAULT + stg %r14,__SF_SIE_REASON(%r15) # set exit reason code + j sie_exit + + EX_TABLE(.Lrewind_pad6,.Lsie_fault) + EX_TABLE(.Lrewind_pad4,.Lsie_fault) + EX_TABLE(.Lrewind_pad2,.Lsie_fault) + EX_TABLE(sie_exit,.Lsie_fault) +EXPORT_SYMBOL(sie64a) +EXPORT_SYMBOL(sie_exit) +#endif + +/* + * SVC interrupt handler routine. System calls are synchronous events and + * are executed with interrupts enabled. + */ + +ENTRY(system_call) + stpt __LC_SYNC_ENTER_TIMER +.Lsysc_stmg: + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + BPOFF + lg %r12,__LC_CURRENT + lghi %r13,__TASK_thread + lghi %r14,_PIF_SYSCALL +.Lsysc_per: + lg %r15,__LC_KERNEL_STACK + la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs +.Lsysc_vtime: + UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW + mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC + stg %r14,__PT_FLAGS(%r11) +.Lsysc_do_svc: + # clear user controlled register to prevent speculative use + xgr %r0,%r0 + # load address of system call table + lg %r10,__THREAD_sysc_table(%r13,%r12) + llgh %r8,__PT_INT_CODE+2(%r11) + slag %r8,%r8,2 # shift and test for svc 0 + jnz .Lsysc_nr_ok + # svc 0: system call number in %r1 + llgfr %r1,%r1 # clear high word in r1 + cghi %r1,NR_syscalls + jnl .Lsysc_nr_ok + sth %r1,__PT_INT_CODE+2(%r11) + slag %r8,%r1,2 +.Lsysc_nr_ok: + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stg %r2,__PT_ORIG_GPR2(%r11) + stg %r7,STACK_FRAME_OVERHEAD(%r15) + lgf %r9,0(%r8,%r10) # get system call add. + TSTMSK __TI_flags(%r12),_TIF_TRACE + jnz .Lsysc_tracesys + BASR_EX %r14,%r9 # call sys_xxxx + stg %r2,__PT_R2(%r11) # store return value + +.Lsysc_return: +#ifdef CONFIG_DEBUG_RSEQ + lgr %r2,%r11 + brasl %r14,rseq_syscall +#endif + LOCKDEP_SYS_EXIT +.Lsysc_tif: + TSTMSK __PT_FLAGS(%r11),_PIF_WORK + jnz .Lsysc_work + TSTMSK __TI_flags(%r12),_TIF_WORK + jnz .Lsysc_work # check for work + TSTMSK __LC_CPU_FLAGS,_CIF_WORK + jnz .Lsysc_work + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP +.Lsysc_restore: + lg %r14,__LC_VDSO_PER_CPU + lmg %r0,%r10,__PT_R0(%r11) + mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) +.Lsysc_exit_timer: + stpt __LC_EXIT_TIMER + mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER + lmg %r11,%r15,__PT_R11(%r11) + lpswe __LC_RETURN_PSW +.Lsysc_done: + +# +# One of the work bits is on. Find out which one. +# +.Lsysc_work: + TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING + jo .Lsysc_mcck_pending + TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED + jo .Lsysc_reschedule + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART + jo .Lsysc_syscall_restart +#ifdef CONFIG_UPROBES + TSTMSK __TI_flags(%r12),_TIF_UPROBE + jo .Lsysc_uprobe_notify +#endif + TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE + jo .Lsysc_guarded_storage + TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP + jo .Lsysc_singlestep +#ifdef CONFIG_LIVEPATCH + TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING + jo .Lsysc_patch_pending # handle live patching just before + # signals and possible syscall restart +#endif + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART + jo .Lsysc_syscall_restart + TSTMSK __TI_flags(%r12),_TIF_SIGPENDING + jo .Lsysc_sigpending + TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME + jo .Lsysc_notify_resume + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jo .Lsysc_vxrs + TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) + jnz .Lsysc_asce + j .Lsysc_return # beware of critical section cleanup + +# +# _TIF_NEED_RESCHED is set, call schedule +# +.Lsysc_reschedule: + larl %r14,.Lsysc_return + jg schedule + +# +# _CIF_MCCK_PENDING is set, call handler +# +.Lsysc_mcck_pending: + larl %r14,.Lsysc_return + jg s390_handle_mcck # TIF bit will be cleared by handler + +# +# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce +# +.Lsysc_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lsysc_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lsysc_set_fs_fixup + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + j .Lsysc_return +.Lsysc_set_fs_fixup: +#endif + larl %r14,.Lsysc_return + jg set_fs_fixup + +# +# CIF_FPU is set, restore floating-point controls and floating-point registers. +# +.Lsysc_vxrs: + larl %r14,.Lsysc_return + jg load_fpu_regs + +# +# _TIF_SIGPENDING is set, call do_signal +# +.Lsysc_sigpending: + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,do_signal + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL + jno .Lsysc_return +.Lsysc_do_syscall: + lghi %r13,__TASK_thread + lmg %r2,%r7,__PT_R2(%r11) # load svc arguments + lghi %r1,0 # svc 0 returns -ENOSYS + j .Lsysc_do_svc + +# +# _TIF_NOTIFY_RESUME is set, call do_notify_resume +# +.Lsysc_notify_resume: + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg do_notify_resume + +# +# _TIF_UPROBE is set, call uprobe_notify_resume +# +#ifdef CONFIG_UPROBES +.Lsysc_uprobe_notify: + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg uprobe_notify_resume +#endif + +# +# _TIF_GUARDED_STORAGE is set, call guarded_storage_load +# +.Lsysc_guarded_storage: + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg gs_load_bc_cb +# +# _TIF_PATCH_PENDING is set, call klp_update_patch_state +# +#ifdef CONFIG_LIVEPATCH +.Lsysc_patch_pending: + lg %r2,__LC_CURRENT # pass pointer to task struct + larl %r14,.Lsysc_return + jg klp_update_patch_state +#endif + +# +# _PIF_PER_TRAP is set, call do_per_trap +# +.Lsysc_singlestep: + ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg do_per_trap + +# +# _PIF_SYSCALL_RESTART is set, repeat the current system call +# +.Lsysc_syscall_restart: + ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART + lmg %r1,%r7,__PT_R1(%r11) # load svc arguments + lg %r2,__PT_ORIG_GPR2(%r11) + j .Lsysc_do_svc + +# +# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before +# and after the system call +# +.Lsysc_tracesys: + lgr %r2,%r11 # pass pointer to pt_regs + la %r3,0 + llgh %r0,__PT_INT_CODE+2(%r11) + stg %r0,__PT_R2(%r11) + brasl %r14,do_syscall_trace_enter + lghi %r0,NR_syscalls + clgr %r0,%r2 + jnh .Lsysc_tracenogo + sllg %r8,%r2,2 + lgf %r9,0(%r8,%r10) +.Lsysc_tracego: + lmg %r3,%r7,__PT_R3(%r11) + stg %r7,STACK_FRAME_OVERHEAD(%r15) + lg %r2,__PT_ORIG_GPR2(%r11) + BASR_EX %r14,%r9 # call sys_xxx + stg %r2,__PT_R2(%r11) # store return value +.Lsysc_tracenogo: + TSTMSK __TI_flags(%r12),_TIF_TRACE + jz .Lsysc_return + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg do_syscall_trace_exit + +# +# a new process exits the kernel with ret_from_fork +# +ENTRY(ret_from_fork) + la %r11,STACK_FRAME_OVERHEAD(%r15) + lg %r12,__LC_CURRENT + brasl %r14,schedule_tail + TRACE_IRQS_ON + ssm __LC_SVC_NEW_PSW # reenable interrupts + tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ? + jne .Lsysc_tracenogo + # it's a kernel thread + lmg %r9,%r10,__PT_R9(%r11) # load gprs +ENTRY(kernel_thread_starter) + la %r2,0(%r10) + BASR_EX %r14,%r9 + j .Lsysc_tracenogo + +/* + * Program check handler routine + */ + +ENTRY(pgm_check_handler) + stpt __LC_SYNC_ENTER_TIMER + BPOFF + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + lg %r10,__LC_LAST_BREAK + lg %r12,__LC_CURRENT + lghi %r11,0 + larl %r13,cleanup_critical + lmg %r8,%r9,__LC_PGM_OLD_PSW + tmhh %r8,0x0001 # test problem state bit + jnz 2f # -> fault in user space +#if IS_ENABLED(CONFIG_KVM) + # cleanup critical section for program checks in sie64a + lgr %r14,%r9 + slg %r14,BASED(.Lsie_critical_start) + clg %r14,BASED(.Lsie_critical_length) + jhe 0f + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + lghi %r11,_PIF_GUEST_FAULT +#endif +0: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 1f # -> enabled, can't be a double fault + tm __LC_PGM_ILC+3,0x80 # check for per exception + jnz .Lpgm_svcper # -> single stepped svc +1: CHECK_STACK STACK_SIZE,__LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + j 4f +2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + lg %r15,__LC_KERNEL_STACK + lgr %r14,%r12 + aghi %r14,__TASK_thread # pointer to thread_struct + lghi %r13,__LC_PGM_TDB + tm __LC_PGM_ILC+2,0x02 # check for transaction abort + jz 3f + mvc __THREAD_trap_tdb(256,%r14),0(%r13) +3: stg %r10,__THREAD_last_break(%r14) +4: lgr %r13,%r11 + la %r11,STACK_FRAME_OVERHEAD(%r15) + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r2,%r2 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + stmg %r8,%r9,__PT_PSW(%r11) + mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC + mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE + stg %r13,__PT_FLAGS(%r11) + stg %r10,__PT_ARGS(%r11) + tm __LC_PGM_ILC+3,0x80 # check for per exception + jz 5f + tmhh %r8,0x0001 # kernel per event ? + jz .Lpgm_kprobe + oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP + mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS + mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE + mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID +5: REENABLE_IRQS + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + larl %r1,pgm_check_table + llgh %r10,__PT_INT_CODE+2(%r11) + nill %r10,0x007f + sll %r10,2 + je .Lpgm_return + lgf %r9,0(%r10,%r1) # load address of handler routine + lgr %r2,%r11 # pass pointer to pt_regs + BASR_EX %r14,%r9 # branch to interrupt-handler +.Lpgm_return: + LOCKDEP_SYS_EXIT + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jno .Lsysc_restore + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL + jo .Lsysc_do_syscall + j .Lsysc_tif + +# +# PER event in supervisor state, must be kprobes +# +.Lpgm_kprobe: + REENABLE_IRQS + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,do_per_trap + j .Lpgm_return + +# +# single stepped system call +# +.Lpgm_svcper: + mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + lghi %r13,__TASK_thread + larl %r14,.Lsysc_per + stg %r14,__LC_RETURN_PSW+8 + lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP + lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs + +/* + * IO interrupt handler routine + */ +ENTRY(io_int_handler) + STCK __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER + BPOFF + stmg %r8,%r15,__LC_SAVE_AREA_ASYNC + lg %r12,__LC_CURRENT + larl %r13,cleanup_critical + lmg %r8,%r9,__LC_IO_OLD_PSW + SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r2,%r2 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r10,%r10 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + stmg %r8,%r9,__PT_PSW(%r11) + mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ + jo .Lio_restore + TRACE_IRQS_OFF + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +.Lio_loop: + lgr %r2,%r11 # pass pointer to pt_regs + lghi %r3,IO_INTERRUPT + tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ? + jz .Lio_call + lghi %r3,THIN_INTERRUPT +.Lio_call: + brasl %r14,do_IRQ + TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR + jz .Lio_return + tpi 0 + jz .Lio_return + mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID + j .Lio_loop +.Lio_return: + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON +.Lio_tif: + TSTMSK __TI_flags(%r12),_TIF_WORK + jnz .Lio_work # there is work to do (signals etc.) + TSTMSK __LC_CPU_FLAGS,_CIF_WORK + jnz .Lio_work +.Lio_restore: + lg %r14,__LC_VDSO_PER_CPU + lmg %r0,%r10,__PT_R0(%r11) + mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jno .Lio_exit_kernel + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP +.Lio_exit_timer: + stpt __LC_EXIT_TIMER + mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER +.Lio_exit_kernel: + lmg %r11,%r15,__PT_R11(%r11) + lpswe __LC_RETURN_PSW +.Lio_done: + +# +# There is work todo, find out in which context we have been interrupted: +# 1) if we return to user space we can do all _TIF_WORK work +# 2) if we return to kernel code and kvm is enabled check if we need to +# modify the psw to leave SIE +# 3) if we return to kernel code and preemptive scheduling is enabled check +# the preemption counter and if it is zero call preempt_schedule_irq +# Before any work can be done, a switch to the kernel stack is required. +# +.Lio_work: + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jo .Lio_work_user # yes -> do resched & signal +#ifdef CONFIG_PREEMPT + # check for preemptive scheduling + icm %r0,15,__LC_PREEMPT_COUNT + jnz .Lio_restore # preemption is disabled + TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED + jno .Lio_restore + # switch to kernel stack + lg %r1,__PT_R15(%r11) + aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) + xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) + la %r11,STACK_FRAME_OVERHEAD(%r1) + lgr %r15,%r1 + # TRACE_IRQS_ON already done at .Lio_return, call + # TRACE_IRQS_OFF to keep things symmetrical + TRACE_IRQS_OFF + brasl %r14,preempt_schedule_irq + j .Lio_return +#else + j .Lio_restore +#endif + +# +# Need to do work before returning to userspace, switch to kernel stack +# +.Lio_work_user: + lg %r1,__LC_KERNEL_STACK + mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) + xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) + la %r11,STACK_FRAME_OVERHEAD(%r1) + lgr %r15,%r1 + +# +# One of the work bits is on. Find out which one. +# +.Lio_work_tif: + TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING + jo .Lio_mcck_pending + TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED + jo .Lio_reschedule +#ifdef CONFIG_LIVEPATCH + TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING + jo .Lio_patch_pending +#endif + TSTMSK __TI_flags(%r12),_TIF_SIGPENDING + jo .Lio_sigpending + TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME + jo .Lio_notify_resume + TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE + jo .Lio_guarded_storage + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jo .Lio_vxrs + TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) + jnz .Lio_asce + j .Lio_return # beware of critical section cleanup + +# +# _CIF_MCCK_PENDING is set, call handler +# +.Lio_mcck_pending: + # TRACE_IRQS_ON already done at .Lio_return + brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler + TRACE_IRQS_OFF + j .Lio_return + +# +# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce +# +.Lio_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lio_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lio_set_fs_fixup + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + j .Lio_return +.Lio_set_fs_fixup: +#endif + larl %r14,.Lio_return + jg set_fs_fixup + +# +# CIF_FPU is set, restore floating-point controls and floating-point registers. +# +.Lio_vxrs: + larl %r14,.Lio_return + jg load_fpu_regs + +# +# _TIF_GUARDED_STORAGE is set, call guarded_storage_load +# +.Lio_guarded_storage: + # TRACE_IRQS_ON already done at .Lio_return + ssm __LC_SVC_NEW_PSW # reenable interrupts + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,gs_load_bc_cb + ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts + TRACE_IRQS_OFF + j .Lio_return + +# +# _TIF_NEED_RESCHED is set, call schedule +# +.Lio_reschedule: + # TRACE_IRQS_ON already done at .Lio_return + ssm __LC_SVC_NEW_PSW # reenable interrupts + brasl %r14,schedule # call scheduler + ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts + TRACE_IRQS_OFF + j .Lio_return + +# +# _TIF_PATCH_PENDING is set, call klp_update_patch_state +# +#ifdef CONFIG_LIVEPATCH +.Lio_patch_pending: + lg %r2,__LC_CURRENT # pass pointer to task struct + larl %r14,.Lio_return + jg klp_update_patch_state +#endif + +# +# _TIF_SIGPENDING or is set, call do_signal +# +.Lio_sigpending: + # TRACE_IRQS_ON already done at .Lio_return + ssm __LC_SVC_NEW_PSW # reenable interrupts + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,do_signal + ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts + TRACE_IRQS_OFF + j .Lio_return + +# +# _TIF_NOTIFY_RESUME or is set, call do_notify_resume +# +.Lio_notify_resume: + # TRACE_IRQS_ON already done at .Lio_return + ssm __LC_SVC_NEW_PSW # reenable interrupts + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,do_notify_resume + ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts + TRACE_IRQS_OFF + j .Lio_return + +/* + * External interrupt handler routine + */ +ENTRY(ext_int_handler) + STCK __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER + BPOFF + stmg %r8,%r15,__LC_SAVE_AREA_ASYNC + lg %r12,__LC_CURRENT + larl %r13,cleanup_critical + lmg %r8,%r9,__LC_EXT_OLD_PSW + SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r2,%r2 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r10,%r10 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + stmg %r8,%r9,__PT_PSW(%r11) + lghi %r1,__LC_EXT_PARAMS2 + mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR + mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS + mvc __PT_INT_PARM_LONG(8,%r11),0(%r1) + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ + jo .Lio_restore + TRACE_IRQS_OFF + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + lghi %r3,EXT_INTERRUPT + brasl %r14,do_IRQ + j .Lio_return + +/* + * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. + */ +ENTRY(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) + stg %r3,__SF_EMPTY(%r15) + larl %r1,.Lpsw_idle_lpsw+4 + stg %r1,__SF_EMPTY+8(%r15) +#ifdef CONFIG_SMP + larl %r1,smp_cpu_mtid + llgf %r1,0(%r1) + ltgr %r1,%r1 + jz .Lpsw_idle_stcctm + .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) +.Lpsw_idle_stcctm: +#endif + oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT + BPON + STCK __CLOCK_IDLE_ENTER(%r2) + stpt __TIMER_IDLE_ENTER(%r2) +.Lpsw_idle_lpsw: + lpswe __SF_EMPTY(%r15) + BR_EX %r14 +.Lpsw_idle_end: + +/* + * Store floating-point controls and floating-point or vector register + * depending whether the vector facility is available. A critical section + * cleanup assures that the registers are stored even if interrupted for + * some other work. The CIF_FPU flag is set to trigger a lazy restore + * of the register contents at return from io or a system call. + */ +ENTRY(save_fpu_regs) + lg %r2,__LC_CURRENT + aghi %r2,__TASK_thread + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jo .Lsave_fpu_regs_exit + stfpc __THREAD_FPU_fpc(%r2) + lg %r3,__THREAD_FPU_regs(%r2) + TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX + jz .Lsave_fpu_regs_fp # no -> store FP regs + VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) + VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) + j .Lsave_fpu_regs_done # -> set CIF_FPU flag +.Lsave_fpu_regs_fp: + std 0,0(%r3) + std 1,8(%r3) + std 2,16(%r3) + std 3,24(%r3) + std 4,32(%r3) + std 5,40(%r3) + std 6,48(%r3) + std 7,56(%r3) + std 8,64(%r3) + std 9,72(%r3) + std 10,80(%r3) + std 11,88(%r3) + std 12,96(%r3) + std 13,104(%r3) + std 14,112(%r3) + std 15,120(%r3) +.Lsave_fpu_regs_done: + oi __LC_CPU_FLAGS+7,_CIF_FPU +.Lsave_fpu_regs_exit: + BR_EX %r14 +.Lsave_fpu_regs_end: +EXPORT_SYMBOL(save_fpu_regs) + +/* + * Load floating-point controls and floating-point or vector registers. + * A critical section cleanup assures that the register contents are + * loaded even if interrupted for some other work. + * + * There are special calling conventions to fit into sysc and io return work: + * %r15: <kernel stack> + * The function requires: + * %r4 + */ +load_fpu_regs: + lg %r4,__LC_CURRENT + aghi %r4,__TASK_thread + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jno .Lload_fpu_regs_exit + lfpc __THREAD_FPU_fpc(%r4) + TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX + lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area + jz .Lload_fpu_regs_fp # -> no VX, load FP regs + VLM %v0,%v15,0,%r4 + VLM %v16,%v31,256,%r4 + j .Lload_fpu_regs_done +.Lload_fpu_regs_fp: + ld 0,0(%r4) + ld 1,8(%r4) + ld 2,16(%r4) + ld 3,24(%r4) + ld 4,32(%r4) + ld 5,40(%r4) + ld 6,48(%r4) + ld 7,56(%r4) + ld 8,64(%r4) + ld 9,72(%r4) + ld 10,80(%r4) + ld 11,88(%r4) + ld 12,96(%r4) + ld 13,104(%r4) + ld 14,112(%r4) + ld 15,120(%r4) +.Lload_fpu_regs_done: + ni __LC_CPU_FLAGS+7,255-_CIF_FPU +.Lload_fpu_regs_exit: + BR_EX %r14 +.Lload_fpu_regs_end: + +.L__critical_end: + +/* + * Machine check handler routines + */ +ENTRY(mcck_int_handler) + STCK __LC_MCCK_CLOCK + BPOFF + la %r1,4095 # validate r1 + spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer + sckc __LC_CLOCK_COMPARATOR # validate comparator + lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs + lg %r12,__LC_CURRENT + larl %r13,cleanup_critical + lmg %r8,%r9,__LC_MCK_OLD_PSW + TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + jo .Lmcck_panic # yes -> rest of mcck code invalid + TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + jno .Lmcck_panic # control registers invalid -> panic + la %r14,4095 + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs + ptlb + lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area + nill %r11,0xfc00 # MCESA_ORIGIN_MASK + TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE + jno 0f + TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID + jno 0f + .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC +0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) + TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID + jo 0f + sr %r14,%r14 +0: sfpc %r14 + TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX + jo 0f + lghi %r14,__LC_FPREGS_SAVE_AREA + ld %f0,0(%r14) + ld %f1,8(%r14) + ld %f2,16(%r14) + ld %f3,24(%r14) + ld %f4,32(%r14) + ld %f5,40(%r14) + ld %f6,48(%r14) + ld %f7,56(%r14) + ld %f8,64(%r14) + ld %f9,72(%r14) + ld %f10,80(%r14) + ld %f11,88(%r14) + ld %f12,96(%r14) + ld %f13,104(%r14) + ld %f14,112(%r14) + ld %f15,120(%r14) + j 1f +0: VLM %v0,%v15,0,%r11 + VLM %v16,%v31,256,%r11 +1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) + TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + jo 3f + la %r14,__LC_SYNC_ENTER_TIMER + clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER + jl 0f + la %r14,__LC_ASYNC_ENTER_TIMER +0: clc 0(8,%r14),__LC_EXIT_TIMER + jl 1f + la %r14,__LC_EXIT_TIMER +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + jl 2f + la %r14,__LC_LAST_UPDATE_TIMER +2: spt 0(%r14) + mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) +3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + jno .Lmcck_panic + tmhh %r8,0x0001 # interrupting from user ? + jnz 4f + TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + jno .Lmcck_panic +4: SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER +.Lmcck_skip: + lghi %r14,__LC_GPREGS_SAVE_AREA+64 + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r2,%r2 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r10,%r10 + mvc __PT_R8(64,%r11),0(%r14) + stmg %r8,%r9,__PT_PSW(%r11) + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,s390_do_machine_check + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jno .Lmcck_return + lg %r1,__LC_KERNEL_STACK # switch to kernel stack + mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) + xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) + la %r11,STACK_FRAME_OVERHEAD(%r1) + lgr %r15,%r1 + ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off + TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING + jno .Lmcck_return + TRACE_IRQS_OFF + brasl %r14,s390_handle_mcck + TRACE_IRQS_ON +.Lmcck_return: + lg %r14,__LC_VDSO_PER_CPU + lmg %r0,%r10,__PT_R0(%r11) + mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + jno 0f + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + stpt __LC_EXIT_TIMER + mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER +0: lmg %r11,%r15,__PT_R11(%r11) + lpswe __LC_RETURN_MCCK_PSW + +.Lmcck_panic: + lg %r15,__LC_PANIC_STACK + la %r11,STACK_FRAME_OVERHEAD(%r15) + j .Lmcck_skip + +# +# PSW restart interrupt handler +# +ENTRY(restart_int_handler) + ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + stg %r15,__LC_SAVE_AREA_RESTART + lg %r15,__LC_RESTART_STACK + aghi %r15,-__PT_SIZE # create pt_regs on stack + xc 0(__PT_SIZE,%r15),0(%r15) + stmg %r0,%r14,__PT_R0(%r15) + mvc __PT_R15(8,%r15),__LC_SAVE_AREA_RESTART + mvc __PT_PSW(16,%r15),__LC_RST_OLD_PSW # store restart old psw + aghi %r15,-STACK_FRAME_OVERHEAD # create stack frame on stack + xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) + lg %r1,__LC_RESTART_FN # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA + lg %r3,__LC_RESTART_SOURCE + ltgr %r3,%r3 # test source cpu address + jm 1f # negative -> skip source stop +0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu + brc 10,0b # wait for status stored +1: basr %r14,%r1 # call function + stap __SF_EMPTY(%r15) # store cpu address + llgh %r3,__SF_EMPTY(%r15) +2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu + brc 2,2b +3: j 3b + + .section .kprobes.text, "ax" + +#ifdef CONFIG_CHECK_STACK +/* + * The synchronous or the asynchronous stack overflowed. We are dead. + * No need to properly save the registers, we are going to panic anyway. + * Setup a pt_regs so that show_trace can provide a good call trace. + */ +stack_overflow: + lg %r15,__LC_PANIC_STACK # change to panic stack + la %r11,STACK_FRAME_OVERHEAD(%r15) + stmg %r0,%r7,__PT_R0(%r11) + stmg %r8,%r9,__PT_PSW(%r11) + mvc __PT_R8(64,%r11),0(%r14) + stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + jg kernel_stack_overflow +#endif + +cleanup_critical: +#if IS_ENABLED(CONFIG_KVM) + clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap + jl 0f + clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done + jl .Lcleanup_sie +#endif + clg %r9,BASED(.Lcleanup_table) # system_call + jl 0f + clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc + jl .Lcleanup_system_call + clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif + jl 0f + clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore + jl .Lcleanup_sysc_tif + clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done + jl .Lcleanup_sysc_restore + clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif + jl 0f + clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore + jl .Lcleanup_io_tif + clg %r9,BASED(.Lcleanup_table+56) # .Lio_done + jl .Lcleanup_io_restore + clg %r9,BASED(.Lcleanup_table+64) # psw_idle + jl 0f + clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end + jl .Lcleanup_idle + clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs + jl 0f + clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end + jl .Lcleanup_save_fpu_regs + clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs + jl 0f + clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end + jl .Lcleanup_load_fpu_regs +0: BR_EX %r14,%r11 + + .align 8 +.Lcleanup_table: + .quad system_call + .quad .Lsysc_do_svc + .quad .Lsysc_tif + .quad .Lsysc_restore + .quad .Lsysc_done + .quad .Lio_tif + .quad .Lio_restore + .quad .Lio_done + .quad psw_idle + .quad .Lpsw_idle_end + .quad save_fpu_regs + .quad .Lsave_fpu_regs_end + .quad load_fpu_regs + .quad .Lload_fpu_regs_end + +#if IS_ENABLED(CONFIG_KVM) +.Lcleanup_table_sie: + .quad .Lsie_gmap + .quad .Lsie_done + +.Lcleanup_sie: + cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? + je 1f + slg %r9,BASED(.Lsie_crit_mcck_start) + clg %r9,BASED(.Lsie_crit_mcck_length) + jh 1f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + BR_EX %r14,%r11 +#endif + +.Lcleanup_system_call: + # check if stpt has been executed + clg %r9,BASED(.Lcleanup_system_call_insn) + jh 0f + mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER + cghi %r11,__LC_SAVE_AREA_ASYNC + je 0f + mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER +0: # check if stmg has been executed + clg %r9,BASED(.Lcleanup_system_call_insn+8) + jh 0f + mvc __LC_SAVE_AREA_SYNC(64),0(%r11) +0: # check if base register setup + TIF bit load has been done + clg %r9,BASED(.Lcleanup_system_call_insn+16) + jhe 0f + # set up saved register r12 task struct pointer + stg %r12,32(%r11) + # set up saved register r13 __TASK_thread offset + mvc 40(8,%r11),BASED(.Lcleanup_system_call_const) +0: # check if the user time update has been done + clg %r9,BASED(.Lcleanup_system_call_insn+24) + jh 0f + lg %r15,__LC_EXIT_TIMER + slg %r15,__LC_SYNC_ENTER_TIMER + alg %r15,__LC_USER_TIMER + stg %r15,__LC_USER_TIMER +0: # check if the system time update has been done + clg %r9,BASED(.Lcleanup_system_call_insn+32) + jh 0f + lg %r15,__LC_LAST_UPDATE_TIMER + slg %r15,__LC_EXIT_TIMER + alg %r15,__LC_SYSTEM_TIMER + stg %r15,__LC_SYSTEM_TIMER +0: # update accounting time stamp + mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + # set up saved register r11 + lg %r15,__LC_KERNEL_STACK + la %r9,STACK_FRAME_OVERHEAD(%r15) + stg %r9,24(%r11) # r11 pt_regs pointer + # fill pt_regs + mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC + stmg %r0,%r7,__PT_R0(%r9) + mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW + mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC + xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9) + mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL + # setup saved register r15 + stg %r15,56(%r11) # r15 stack pointer + # set new psw address and exit + larl %r9,.Lsysc_do_svc + BR_EX %r14,%r11 +.Lcleanup_system_call_insn: + .quad system_call + .quad .Lsysc_stmg + .quad .Lsysc_per + .quad .Lsysc_vtime+36 + .quad .Lsysc_vtime+42 +.Lcleanup_system_call_const: + .quad __TASK_thread + +.Lcleanup_sysc_tif: + larl %r9,.Lsysc_tif + BR_EX %r14,%r11 + +.Lcleanup_sysc_restore: + # check if stpt has been executed + clg %r9,BASED(.Lcleanup_sysc_restore_insn) + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + cghi %r11,__LC_SAVE_AREA_ASYNC + je 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) + je 1f + lg %r9,24(%r11) # get saved pointer to pt_regs + mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) + mvc 0(64,%r11),__PT_R8(%r9) + lmg %r0,%r7,__PT_R0(%r9) +1: lmg %r8,%r9,__LC_RETURN_PSW + BR_EX %r14,%r11 +.Lcleanup_sysc_restore_insn: + .quad .Lsysc_exit_timer + .quad .Lsysc_done - 4 + +.Lcleanup_io_tif: + larl %r9,.Lio_tif + BR_EX %r14,%r11 + +.Lcleanup_io_restore: + # check if stpt has been executed + clg %r9,BASED(.Lcleanup_io_restore_insn) + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) + je 1f + lg %r9,24(%r11) # get saved r11 pointer to pt_regs + mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) + mvc 0(64,%r11),__PT_R8(%r9) + lmg %r0,%r7,__PT_R0(%r9) +1: lmg %r8,%r9,__LC_RETURN_PSW + BR_EX %r14,%r11 +.Lcleanup_io_restore_insn: + .quad .Lio_exit_timer + .quad .Lio_done - 4 + +.Lcleanup_idle: + ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT + # copy interrupt clock & cpu timer + mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK + mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER + cghi %r11,__LC_SAVE_AREA_ASYNC + je 0f + mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK + mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER +0: # check if stck & stpt have been executed + clg %r9,BASED(.Lcleanup_idle_insn) + jhe 1f + mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) + mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) +1: # calculate idle cycles +#ifdef CONFIG_SMP + clg %r9,BASED(.Lcleanup_idle_insn) + jl 3f + larl %r1,smp_cpu_mtid + llgf %r1,0(%r1) + ltgr %r1,%r1 + jz 3f + .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15) + larl %r3,mt_cycles + ag %r3,__LC_PERCPU_OFFSET + la %r4,__SF_EMPTY+16(%r15) +2: lg %r0,0(%r3) + slg %r0,0(%r4) + alg %r0,64(%r4) + stg %r0,0(%r3) + la %r3,8(%r3) + la %r4,8(%r4) + brct %r1,2b +#endif +3: # account system time going idle + lg %r9,__LC_STEAL_TIMER + alg %r9,__CLOCK_IDLE_ENTER(%r2) + slg %r9,__LC_LAST_UPDATE_CLOCK + stg %r9,__LC_STEAL_TIMER + mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) + lg %r9,__LC_SYSTEM_TIMER + alg %r9,__LC_LAST_UPDATE_TIMER + slg %r9,__TIMER_IDLE_ENTER(%r2) + stg %r9,__LC_SYSTEM_TIMER + mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) + # prepare return psw + nihh %r8,0xfcfd # clear irq & wait state bits + lg %r9,48(%r11) # return from psw_idle + BR_EX %r14,%r11 +.Lcleanup_idle_insn: + .quad .Lpsw_idle_lpsw + +.Lcleanup_save_fpu_regs: + larl %r9,save_fpu_regs + BR_EX %r14,%r11 + +.Lcleanup_load_fpu_regs: + larl %r9,load_fpu_regs + BR_EX %r14,%r11 + +/* + * Integer constants + */ + .align 8 +.Lcritical_start: + .quad .L__critical_start +.Lcritical_length: + .quad .L__critical_end - .L__critical_start +#if IS_ENABLED(CONFIG_KVM) +.Lsie_critical_start: + .quad .Lsie_gmap +.Lsie_critical_length: + .quad .Lsie_done - .Lsie_gmap +.Lsie_crit_mcck_start: + .quad .Lsie_entry +.Lsie_crit_mcck_length: + .quad .Lsie_skip - .Lsie_entry +#endif + .section .rodata, "a" +#define SYSCALL(esame,emu) .long esame + .globl sys_call_table +sys_call_table: +#include "asm/syscall_table.h" +#undef SYSCALL + +#ifdef CONFIG_COMPAT + +#define SYSCALL(esame,emu) .long emu + .globl sys_call_table_emu +sys_call_table_emu: +#include "asm/syscall_table.h" +#undef SYSCALL +#endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h new file mode 100644 index 000000000..472fa2f1a --- /dev/null +++ b/arch/s390/kernel/entry.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ENTRY_H +#define _ENTRY_H + +#include <linux/percpu.h> +#include <linux/types.h> +#include <linux/signal.h> +#include <asm/ptrace.h> +#include <asm/idle.h> + +extern void *restart_stack; +extern unsigned long suspend_zero_pages; + +void system_call(void); +void pgm_check_handler(void); +void ext_int_handler(void); +void io_int_handler(void); +void mcck_int_handler(void); +void restart_int_handler(void); +void restart_call_handler(void); + +asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); +asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); + +void do_protection_exception(struct pt_regs *regs); +void do_dat_exception(struct pt_regs *regs); + +void addressing_exception(struct pt_regs *regs); +void data_exception(struct pt_regs *regs); +void default_trap_handler(struct pt_regs *regs); +void divide_exception(struct pt_regs *regs); +void execute_exception(struct pt_regs *regs); +void hfp_divide_exception(struct pt_regs *regs); +void hfp_overflow_exception(struct pt_regs *regs); +void hfp_significance_exception(struct pt_regs *regs); +void hfp_sqrt_exception(struct pt_regs *regs); +void hfp_underflow_exception(struct pt_regs *regs); +void illegal_op(struct pt_regs *regs); +void operand_exception(struct pt_regs *regs); +void overflow_exception(struct pt_regs *regs); +void privileged_op(struct pt_regs *regs); +void space_switch_exception(struct pt_regs *regs); +void special_op_exception(struct pt_regs *regs); +void specification_exception(struct pt_regs *regs); +void transaction_exception(struct pt_regs *regs); +void translation_exception(struct pt_regs *regs); +void vector_exception(struct pt_regs *regs); + +void do_per_trap(struct pt_regs *regs); +void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); +void syscall_trace(struct pt_regs *regs, int entryexit); +void kernel_stack_overflow(struct pt_regs * regs); +void do_signal(struct pt_regs *regs); +void handle_signal32(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs); +void do_notify_resume(struct pt_regs *regs); + +void __init init_IRQ(void); +void do_IRQ(struct pt_regs *regs, int irq); +void do_restart(void); +void __init startup_init_nobss(void); +void __init startup_init(void); +void die(struct pt_regs *regs, const char *str); +int setup_profiling_timer(unsigned int multiplier); +void __init time_init(void); +int pfn_is_nosave(unsigned long); +void s390_early_resume(void); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); + +struct s390_mmap_arg_struct; +struct fadvise64_64_args; +struct old_sigaction; + +long sys_rt_sigreturn(void); +long sys_sigreturn(void); + +long sys_s390_personality(unsigned int personality); +long sys_s390_runtime_instr(int command, int signum); +long sys_s390_guarded_storage(int command, struct gs_cb __user *); +long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); +long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); +long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); + +DECLARE_PER_CPU(u64, mt_cycles[8]); + +void gs_load_bc_cb(struct pt_regs *regs); +void set_fs_fixup(void); + +#endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c new file mode 100644 index 000000000..594464f21 --- /dev/null +++ b/arch/s390/kernel/fpu.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * In-kernel vector facility support functions + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <asm/fpu/types.h> +#include <asm/fpu/api.h> + +asm(".include \"asm/vx-insn.h\"\n"); + +void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the save to the FPU/vector registers already + * in use by the previous context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Save floating point control */ + asm volatile("stfpc %0" : "=m" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Save floating-point registers */ + asm volatile("std 0,%0" : "=Q" (state->fprs[0])); + asm volatile("std 1,%0" : "=Q" (state->fprs[1])); + asm volatile("std 2,%0" : "=Q" (state->fprs[2])); + asm volatile("std 3,%0" : "=Q" (state->fprs[3])); + asm volatile("std 4,%0" : "=Q" (state->fprs[4])); + asm volatile("std 5,%0" : "=Q" (state->fprs[5])); + asm volatile("std 6,%0" : "=Q" (state->fprs[6])); + asm volatile("std 7,%0" : "=Q" (state->fprs[7])); + asm volatile("std 8,%0" : "=Q" (state->fprs[8])); + asm volatile("std 9,%0" : "=Q" (state->fprs[9])); + asm volatile("std 10,%0" : "=Q" (state->fprs[10])); + asm volatile("std 11,%0" : "=Q" (state->fprs[11])); + asm volatile("std 12,%0" : "=Q" (state->fprs[12])); + asm volatile("std 13,%0" : "=Q" (state->fprs[13])); + asm volatile("std 14,%0" : "=Q" (state->fprs[14])); + asm volatile("std 15,%0" : "=Q" (state->fprs[15])); + } + return; + } + + /* Test and save vector registers */ + asm volatile ( + /* + * Test if any vector register must be saved and, if so, + * test if all register can be saved. + */ + " la 1,%[vxrs]\n" /* load save area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> save V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vstm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> save V8..V23 */ + " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and save the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> save V0..V15 */ + " brc 2,1f\n" /* 10 -> save V8..V15 */ + " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + /* Test and save the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> save V16..V31 */ + " brc 2,4f\n" /* 10 -> save V24..V31 */ + " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the restore to the FPU/vector registers of the + * previous context that have been overwritte by the + * current context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Restore floating-point controls */ + asm volatile("lfpc %0" : : "Q" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Restore floating-point registers */ + asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); + asm volatile("ld 1,%0" : : "Q" (state->fprs[1])); + asm volatile("ld 2,%0" : : "Q" (state->fprs[2])); + asm volatile("ld 3,%0" : : "Q" (state->fprs[3])); + asm volatile("ld 4,%0" : : "Q" (state->fprs[4])); + asm volatile("ld 5,%0" : : "Q" (state->fprs[5])); + asm volatile("ld 6,%0" : : "Q" (state->fprs[6])); + asm volatile("ld 7,%0" : : "Q" (state->fprs[7])); + asm volatile("ld 8,%0" : : "Q" (state->fprs[8])); + asm volatile("ld 9,%0" : : "Q" (state->fprs[9])); + asm volatile("ld 10,%0" : : "Q" (state->fprs[10])); + asm volatile("ld 11,%0" : : "Q" (state->fprs[11])); + asm volatile("ld 12,%0" : : "Q" (state->fprs[12])); + asm volatile("ld 13,%0" : : "Q" (state->fprs[13])); + asm volatile("ld 14,%0" : : "Q" (state->fprs[14])); + asm volatile("ld 15,%0" : : "Q" (state->fprs[15])); + } + return; + } + + /* Test and restore (load) vector registers */ + asm volatile ( + /* + * Test if any vector register must be loaded and, if so, + * test if all registers can be loaded at once. + */ + " la 1,%[vxrs]\n" /* load restore area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> restore V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vlm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> restore V8..V23 */ + " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and restore the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> restore V0..V15 */ + " brc 2,1f\n" /* 10 -> restore V8..V15 */ + " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + /* Test and restore the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> restore V16..V31 */ + " brc 2,4f\n" /* 10 -> restore V24..V31 */ + " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_end); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c new file mode 100644 index 000000000..463b9e927 --- /dev/null +++ b/arch/s390/kernel/ftrace.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dynamic function tracer architecture backend. + * + * Copyright IBM Corp. 2009,2014 + * + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/moduleloader.h> +#include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/kprobes.h> +#include <trace/syscall.h> +#include <asm/asm-offsets.h> +#include <asm/cacheflush.h> +#include <asm/set_memory.h> +#include "entry.h" + +/* + * The mcount code looks like this: + * stg %r14,8(%r15) # offset 0 + * larl %r1,<&counter> # offset 6 + * brasl %r14,_mcount # offset 12 + * lg %r14,8(%r15) # offset 18 + * Total length is 24 bytes. Only the first instruction will be patched + * by ftrace_make_call / ftrace_make_nop. + * The enabled ftrace code block looks like this: + * > brasl %r0,ftrace_caller # offset 0 + * larl %r1,<&counter> # offset 6 + * brasl %r14,_mcount # offset 12 + * lg %r14,8(%r15) # offset 18 + * The ftrace function gets called with a non-standard C function call ABI + * where r0 contains the return address. It is also expected that the called + * function only clobbers r0 and r1, but restores r2-r15. + * For module code we can't directly jump to ftrace caller, but need a + * trampoline (ftrace_plt), which clobbers also r1. + * The return point of the ftrace function has offset 24, so execution + * continues behind the mcount block. + * The disabled ftrace code block looks like this: + * > jg .+24 # offset 0 + * larl %r1,<&counter> # offset 6 + * brasl %r14,_mcount # offset 12 + * lg %r14,8(%r15) # offset 18 + * The jg instruction branches to offset 24 to skip as many instructions + * as possible. + * In case we use gcc's hotpatch feature the original and also the disabled + * function prologue contains only a single six byte instruction and looks + * like this: + * > brcl 0,0 # offset 0 + * To enable ftrace the code gets patched like above and afterwards looks + * like this: + * > brasl %r0,ftrace_caller # offset 0 + */ + +void *ftrace_func __read_mostly = ftrace_stub; +unsigned long ftrace_plt; + +static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn) +{ +#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT) + /* brcl 0,0 */ + insn->opc = 0xc004; + insn->disp = 0; +#else + /* stg r14,8(r15) */ + insn->opc = 0xe3e0; + insn->disp = 0xf0080024; +#endif +} + +static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + if (insn->opc == BREAKPOINT_INSTRUCTION) + return 1; +#endif + return 0; +} + +static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + insn->opc = BREAKPOINT_INSTRUCTION; + insn->disp = KPROBE_ON_FTRACE_NOP; +#endif +} + +static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn) +{ +#ifdef CONFIG_KPROBES + insn->opc = BREAKPOINT_INSTRUCTION; + insn->disp = KPROBE_ON_FTRACE_CALL; +#endif +} + +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + struct ftrace_insn orig, new, old; + + if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) + return -EFAULT; + if (addr == MCOUNT_ADDR) { + /* Initial code replacement */ + ftrace_generate_orig_insn(&orig); + ftrace_generate_nop_insn(&new); + } else if (is_kprobe_on_ftrace(&old)) { + /* + * If we find a breakpoint instruction, a kprobe has been + * placed at the beginning of the function. We write the + * constant KPROBE_ON_FTRACE_NOP into the remaining four + * bytes of the original instruction so that the kprobes + * handler can execute a nop, if it reaches this breakpoint. + */ + ftrace_generate_kprobe_call_insn(&orig); + ftrace_generate_kprobe_nop_insn(&new); + } else { + /* Replace ftrace call with a nop. */ + ftrace_generate_call_insn(&orig, rec->ip); + ftrace_generate_nop_insn(&new); + } + /* Verify that the to be replaced code matches what we expect. */ + if (memcmp(&orig, &old, sizeof(old))) + return -EINVAL; + s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + return 0; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + struct ftrace_insn orig, new, old; + + if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old))) + return -EFAULT; + if (is_kprobe_on_ftrace(&old)) { + /* + * If we find a breakpoint instruction, a kprobe has been + * placed at the beginning of the function. We write the + * constant KPROBE_ON_FTRACE_CALL into the remaining four + * bytes of the original instruction so that the kprobes + * handler can execute a brasl if it reaches this breakpoint. + */ + ftrace_generate_kprobe_nop_insn(&orig); + ftrace_generate_kprobe_call_insn(&new); + } else { + /* Replace nop with an ftrace call. */ + ftrace_generate_nop_insn(&orig); + ftrace_generate_call_insn(&new, rec->ip); + } + /* Verify that the to be replaced code matches what we expect. */ + if (memcmp(&orig, &old, sizeof(old))) + return -EINVAL; + s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + return 0; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + ftrace_func = func; + return 0; +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} + +#ifdef CONFIG_MODULES + +static int __init ftrace_plt_init(void) +{ + unsigned int *ip; + + ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE); + if (!ftrace_plt) + panic("cannot allocate ftrace plt\n"); + ip = (unsigned int *) ftrace_plt; + ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ + ip[1] = 0x100a0004; + ip[2] = 0x07f10000; + ip[3] = FTRACE_ADDR >> 32; + ip[4] = FTRACE_ADDR & 0xffffffff; + set_memory_ro(ftrace_plt, 1); + return 0; +} +device_initcall(ftrace_plt_init); + +#endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * Hook the return address and push it in the stack of return addresses + * in current thread info. + */ +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +{ + if (unlikely(ftrace_graph_is_dead())) + goto out; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + goto out; + ip -= MCOUNT_INSN_SIZE; + if (!function_graph_enter(parent, ip, 0, NULL)) + parent = (unsigned long) return_to_handler; +out: + return parent; +} +NOKPROBE_SYMBOL(prepare_ftrace_return); + +/* + * Patch the kernel code at ftrace_graph_caller location. The instruction + * there is branch relative on condition. To enable the ftrace graph code + * block, we simply patch the mask field of the instruction to zero and + * turn the instruction into a nop. + * To disable the ftrace graph code the mask field will be patched to + * all ones, which turns the instruction into an unconditional branch. + */ +int ftrace_enable_ftrace_graph_caller(void) +{ + u8 op = 0x04; /* set mask field to zero */ + + s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + return 0; +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + u8 op = 0xf4; /* set mask field to all ones */ + + s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); + return 0; +} + +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c new file mode 100644 index 000000000..d14dd1c2e --- /dev/null +++ b/arch/s390/kernel/guarded_storage.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/signal.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/guarded_storage.h> +#include "entry.h" + +void guarded_storage_release(struct task_struct *tsk) +{ + kfree(tsk->thread.gs_cb); + kfree(tsk->thread.gs_bc_cb); +} + +static int gs_enable(void) +{ + struct gs_cb *gs_cb; + + if (!current->thread.gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + gs_cb->gsd = 25; + preempt_disable(); + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + preempt_enable(); + } + return 0; +} + +static int gs_disable(void) +{ + if (current->thread.gs_cb) { + preempt_disable(); + kfree(current->thread.gs_cb); + current->thread.gs_cb = NULL; + __ctl_clear_bit(2, 4); + preempt_enable(); + } + return 0; +} + +static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + if (!gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + current->thread.gs_bc_cb = gs_cb; + } + if (copy_from_user(gs_cb, u_gs_cb, sizeof(*gs_cb))) + return -EFAULT; + return 0; +} + +static int gs_clear_bc_cb(void) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + current->thread.gs_bc_cb = NULL; + kfree(gs_cb); + return 0; +} + +void gs_load_bc_cb(struct pt_regs *regs) +{ + struct gs_cb *gs_cb; + + preempt_disable(); + clear_thread_flag(TIF_GUARDED_STORAGE); + gs_cb = current->thread.gs_bc_cb; + if (gs_cb) { + kfree(current->thread.gs_cb); + current->thread.gs_bc_cb = NULL; + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + } + preempt_enable(); +} + +static int gs_broadcast(void) +{ + struct task_struct *sibling; + + read_lock(&tasklist_lock); + for_each_thread(current, sibling) { + if (!sibling->thread.gs_bc_cb) + continue; + if (test_and_set_tsk_thread_flag(sibling, TIF_GUARDED_STORAGE)) + kick_process(sibling); + } + read_unlock(&tasklist_lock); + return 0; +} + +SYSCALL_DEFINE2(s390_guarded_storage, int, command, + struct gs_cb __user *, gs_cb) +{ + if (!MACHINE_HAS_GS) + return -EOPNOTSUPP; + switch (command) { + case GS_ENABLE: + return gs_enable(); + case GS_DISABLE: + return gs_disable(); + case GS_SET_BC_CB: + return gs_set_bc_cb(gs_cb); + case GS_CLEAR_BC_CB: + return gs_clear_bc_cb(); + case GS_BROADCAST: + return gs_broadcast(); + default: + return -EINVAL; + } +} diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S new file mode 100644 index 000000000..6d14ad42b --- /dev/null +++ b/arch/s390/kernel/head64.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2010 + * + * Author(s): Hartmut Penner <hp@de.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Rob van der Heij <rvdhei@iae.nl> + * Heiko Carstens <heiko.carstens@de.ibm.com> + * + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page.h> + +__HEAD +ENTRY(startup_continue) + tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ? + jz 0f + xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid + mvi __LC_LPP,0x80 # and set LPP_MAGIC + .insn s,0xb2800000,__LC_LPP # load program parameter +0: larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK + larl %r13,.LPG1 # get base + lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers + lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area + # move IPL device to lowcore + larl %r0,boot_vdso_data + stg %r0,__LC_VDSO_PER_CPU +# +# Setup stack +# + larl %r14,init_task + stg %r14,__LC_CURRENT + larl %r15,init_thread_union + aghi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) # init_task_union + THREAD_SIZE + stg %r15,__LC_KERNEL_STACK # set end of kernel stack + aghi %r15,-160 +# +# Early setup functions that may not rely on an initialized bss section, +# like moving the initrd. Returns with an initialized bss section. +# + brasl %r14,startup_init_nobss +# +# Early machine initialization and detection functions. +# + brasl %r14,startup_init + +# check control registers + stctg %c0,%c15,0(%r15) + oi 6(%r15),0x60 # enable sigp emergency & external call + oi 4(%r15),0x10 # switch on low address proctection + lctlg %c0,%c15,0(%r15) + + lam 0,15,.Laregs-.LPG1(%r13) # load acrs needed by uaccess + brasl %r14,start_kernel # go to C code +# +# We returned from start_kernel ?!? PANIK +# + basr %r13,0 + lpswe .Ldw-.(%r13) # load disabled wait psw + + .align 16 +.LPG1: +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad .Llinkage_stack # cr15: linkage stack operations +.Lpcmsk:.quad 0x0000000180000000 +.L4malign:.quad 0xffffffffffc00000 +.Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 +.Lnop: .long 0x07000700 +.Lparmaddr: + .quad PARMAREA + .align 64 +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 + .long 0,0,0,0,0,0,0,0 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 + .align 128 +.Lduald:.rept 8 + .long 0x80000000,0,0,0 # invalid access-list entries + .endr +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 +.Ldw: .quad 0x0002000180000000,0x0000000000000000 +.Laregs:.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c new file mode 100644 index 000000000..8f8456816 --- /dev/null +++ b/arch/s390/kernel/idle.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Idle functions for s390. + * + * Copyright IBM Corp. 2014 + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/kprobes.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/sched/cputime.h> +#include <asm/nmi.h> +#include <asm/smp.h> +#include "entry.h" + +static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); + +void enabled_wait(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long long idle_time; + unsigned long psw_mask; + + trace_hardirqs_on(); + + /* Wait for external, I/O or machine check interrupt. */ + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + clear_cpu_flag(CIF_NOHZ_DELAY); + + /* Call the assembler magic in entry.S */ + psw_idle(idle, psw_mask); + + trace_hardirqs_off(); + + /* Account time spent with enabled wait psw loaded as idle time. */ + write_seqcount_begin(&idle->seqcount); + idle_time = idle->clock_idle_exit - idle->clock_idle_enter; + idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; + idle->idle_time += idle_time; + idle->idle_count++; + account_idle_time(cputime_to_nsecs(idle_time)); + write_seqcount_end(&idle->seqcount); +} +NOKPROBE_SYMBOL(enabled_wait); + +static ssize_t show_idle_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); + unsigned long long idle_count; + unsigned int seq; + + do { + seq = read_seqcount_begin(&idle->seqcount); + idle_count = READ_ONCE(idle->idle_count); + if (READ_ONCE(idle->clock_idle_enter)) + idle_count++; + } while (read_seqcount_retry(&idle->seqcount, seq)); + return sprintf(buf, "%llu\n", idle_count); +} +DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); + +static ssize_t show_idle_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long long now, idle_time, idle_enter, idle_exit, in_idle; + struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); + unsigned int seq; + + do { + seq = read_seqcount_begin(&idle->seqcount); + idle_time = READ_ONCE(idle->idle_time); + idle_enter = READ_ONCE(idle->clock_idle_enter); + idle_exit = READ_ONCE(idle->clock_idle_exit); + } while (read_seqcount_retry(&idle->seqcount, seq)); + in_idle = 0; + now = get_tod_clock(); + if (idle_enter) { + if (idle_exit) { + in_idle = idle_exit - idle_enter; + } else if (now > idle_enter) { + in_idle = now - idle_enter; + } + } + idle_time += in_idle; + return sprintf(buf, "%llu\n", idle_time >> 12); +} +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); + +u64 arch_cpu_idle_time(int cpu) +{ + struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); + unsigned long long now, idle_enter, idle_exit, in_idle; + unsigned int seq; + + do { + seq = read_seqcount_begin(&idle->seqcount); + idle_enter = READ_ONCE(idle->clock_idle_enter); + idle_exit = READ_ONCE(idle->clock_idle_exit); + } while (read_seqcount_retry(&idle->seqcount, seq)); + in_idle = 0; + now = get_tod_clock(); + if (idle_enter) { + if (idle_exit) { + in_idle = idle_exit - idle_enter; + } else if (now > idle_enter) { + in_idle = now - idle_enter; + } + } + return cputime_to_nsecs(in_idle); +} + +void arch_cpu_idle_enter(void) +{ + local_mcck_disable(); +} + +void arch_cpu_idle(void) +{ + if (!test_cpu_flag(CIF_MCCK_PENDING)) + /* Halt the cpu and keep track of cpu time accounting. */ + enabled_wait(); + local_irq_enable(); +} + +void arch_cpu_idle_exit(void) +{ + local_mcck_enable(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); +} + +void arch_cpu_idle_dead(void) +{ + cpu_die(); +} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c new file mode 100644 index 000000000..4296d7e61 --- /dev/null +++ b/arch/s390/kernel/ipl.c @@ -0,0 +1,1787 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ipl/reipl/dump support for Linux on s390. + * + * Copyright IBM Corp. 2005, 2012 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + * Heiko Carstens <heiko.carstens@de.ibm.com> + * Volker Sameske <sameske@de.ibm.com> + */ + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/crash_dump.h> +#include <linux/debug_locks.h> +#include <asm/diag.h> +#include <asm/ipl.h> +#include <asm/smp.h> +#include <asm/setup.h> +#include <asm/cpcmd.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> +#include <asm/checksum.h> +#include <asm/debug.h> +#include <asm/os_info.h> +#include "entry.h" + +#define IPL_PARM_BLOCK_VERSION 0 + +#define IPL_UNKNOWN_STR "unknown" +#define IPL_CCW_STR "ccw" +#define IPL_FCP_STR "fcp" +#define IPL_FCP_DUMP_STR "fcp_dump" +#define IPL_NSS_STR "nss" + +#define DUMP_CCW_STR "ccw" +#define DUMP_FCP_STR "fcp" +#define DUMP_NONE_STR "none" + +/* + * Four shutdown trigger types are supported: + * - panic + * - halt + * - power off + * - reipl + * - restart + */ +#define ON_PANIC_STR "on_panic" +#define ON_HALT_STR "on_halt" +#define ON_POFF_STR "on_poff" +#define ON_REIPL_STR "on_reboot" +#define ON_RESTART_STR "on_restart" + +struct shutdown_action; +struct shutdown_trigger { + char *name; + struct shutdown_action *action; +}; + +/* + * The following shutdown action types are supported: + */ +#define SHUTDOWN_ACTION_IPL_STR "ipl" +#define SHUTDOWN_ACTION_REIPL_STR "reipl" +#define SHUTDOWN_ACTION_DUMP_STR "dump" +#define SHUTDOWN_ACTION_VMCMD_STR "vmcmd" +#define SHUTDOWN_ACTION_STOP_STR "stop" +#define SHUTDOWN_ACTION_DUMP_REIPL_STR "dump_reipl" + +struct shutdown_action { + char *name; + void (*fn) (struct shutdown_trigger *trigger); + int (*init) (void); + int init_rc; +}; + +static char *ipl_type_str(enum ipl_type type) +{ + switch (type) { + case IPL_TYPE_CCW: + return IPL_CCW_STR; + case IPL_TYPE_FCP: + return IPL_FCP_STR; + case IPL_TYPE_FCP_DUMP: + return IPL_FCP_DUMP_STR; + case IPL_TYPE_NSS: + return IPL_NSS_STR; + case IPL_TYPE_UNKNOWN: + default: + return IPL_UNKNOWN_STR; + } +} + +enum dump_type { + DUMP_TYPE_NONE = 1, + DUMP_TYPE_CCW = 2, + DUMP_TYPE_FCP = 4, +}; + +static char *dump_type_str(enum dump_type type) +{ + switch (type) { + case DUMP_TYPE_NONE: + return DUMP_NONE_STR; + case DUMP_TYPE_CCW: + return DUMP_CCW_STR; + case DUMP_TYPE_FCP: + return DUMP_FCP_STR; + default: + return NULL; + } +} + +static int ipl_block_valid; +static struct ipl_parameter_block ipl_block; + +static int reipl_capabilities = IPL_TYPE_UNKNOWN; + +static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; +static struct ipl_parameter_block *reipl_block_fcp; +static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_nss; +static struct ipl_parameter_block *reipl_block_actual; + +static int dump_capabilities = DUMP_TYPE_NONE; +static enum dump_type dump_type = DUMP_TYPE_NONE; +static struct ipl_parameter_block *dump_block_fcp; +static struct ipl_parameter_block *dump_block_ccw; + +static struct sclp_ipl_info sclp_ipl_info; + +static inline int __diag308(unsigned long subcode, void *addr) +{ + register unsigned long _addr asm("0") = (unsigned long) addr; + register unsigned long _rc asm("1") = 0; + + asm volatile( + " diag %0,%2,0x308\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : "+d" (_addr), "+d" (_rc) + : "d" (subcode) : "cc", "memory"); + return _rc; +} + +int diag308(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X308); + return __diag308(subcode, addr); +} +EXPORT_SYMBOL_GPL(diag308); + +/* SYSFS */ + +#define IPL_ATTR_SHOW_FN(_prefix, _name, _format, args...) \ +static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + return snprintf(page, PAGE_SIZE, _format, ##args); \ +} + +#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long ssid, devno; \ + \ + if (sscanf(buf, "0.%llx.%llx\n", &ssid, &devno) != 2) \ + return -EINVAL; \ + \ + if (ssid > __MAX_SSID || devno > __MAX_SUBCHANNEL) \ + return -EINVAL; \ + \ + _ipl_blk.ssid = ssid; \ + _ipl_blk.devno = devno; \ + return len; \ +} + +#define DEFINE_IPL_CCW_ATTR_RW(_prefix, _name, _ipl_blk) \ +IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ + _ipl_blk.ssid, _ipl_blk.devno); \ +IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, (S_IRUGO | S_IWUSR), \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) \ + +#define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ +IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL) + +#define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ +IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long value; \ + if (sscanf(buf, _fmt_in, &value) != 1) \ + return -EINVAL; \ + _value = value; \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name,(S_IRUGO | S_IWUSR), \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + +#define DEFINE_IPL_ATTR_STR_RW(_prefix, _name, _fmt_out, _fmt_in, _value)\ +IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + strncpy(_value, buf, sizeof(_value) - 1); \ + strim(_value); \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name,(S_IRUGO | S_IWUSR), \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + +/* + * ipl section + */ + +static __init enum ipl_type get_ipl_type(void) +{ + if (!ipl_block_valid) + return IPL_TYPE_UNKNOWN; + + switch (ipl_block.hdr.pbt) { + case DIAG308_IPL_TYPE_CCW: + return IPL_TYPE_CCW; + case DIAG308_IPL_TYPE_FCP: + if (ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) + return IPL_TYPE_FCP_DUMP; + else + return IPL_TYPE_FCP; + } + return IPL_TYPE_UNKNOWN; +} + +struct ipl_info ipl_info; +EXPORT_SYMBOL_GPL(ipl_info); + +static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sprintf(page, "%s\n", ipl_type_str(ipl_info.type)); +} + +static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); + +/* VM IPL PARM routines */ +static size_t reipl_get_ascii_vmparm(char *dest, size_t size, + const struct ipl_parameter_block *ipb) +{ + int i; + size_t len; + char has_lowercase = 0; + + len = 0; + if ((ipb->ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && + (ipb->ipl_info.ccw.vm_parm_len > 0)) { + + len = min_t(size_t, size - 1, ipb->ipl_info.ccw.vm_parm_len); + memcpy(dest, ipb->ipl_info.ccw.vm_parm, len); + /* If at least one character is lowercase, we assume mixed + * case; otherwise we convert everything to lowercase. + */ + for (i = 0; i < len; i++) + if ((dest[i] > 0x80 && dest[i] < 0x8a) || /* a-i */ + (dest[i] > 0x90 && dest[i] < 0x9a) || /* j-r */ + (dest[i] > 0xa1 && dest[i] < 0xaa)) { /* s-z */ + has_lowercase = 1; + break; + } + if (!has_lowercase) + EBC_TOLOWER(dest, len); + EBCASC(dest, len); + } + dest[len] = 0; + + return len; +} + +size_t append_ipl_vmparm(char *dest, size_t size) +{ + size_t rc; + + rc = 0; + if (ipl_block_valid && ipl_block.hdr.pbt == DIAG308_IPL_TYPE_CCW) + rc = reipl_get_ascii_vmparm(dest, size, &ipl_block); + else + dest[0] = 0; + return rc; +} + +static ssize_t ipl_vm_parm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char parm[DIAG308_VMPARM_SIZE + 1] = {}; + + append_ipl_vmparm(parm, sizeof(parm)); + return sprintf(page, "%s\n", parm); +} + +static size_t scpdata_length(const char* buf, size_t count) +{ + while (count) { + if (buf[count - 1] != '\0' && buf[count - 1] != ' ') + break; + count--; + } + return count; +} + +static size_t reipl_append_ascii_scpdata(char *dest, size_t size, + const struct ipl_parameter_block *ipb) +{ + size_t count; + size_t i; + int has_lowercase; + + count = min(size - 1, scpdata_length(ipb->ipl_info.fcp.scp_data, + ipb->ipl_info.fcp.scp_data_len)); + if (!count) + goto out; + + has_lowercase = 0; + for (i = 0; i < count; i++) { + if (!isascii(ipb->ipl_info.fcp.scp_data[i])) { + count = 0; + goto out; + } + if (!has_lowercase && islower(ipb->ipl_info.fcp.scp_data[i])) + has_lowercase = 1; + } + + if (has_lowercase) + memcpy(dest, ipb->ipl_info.fcp.scp_data, count); + else + for (i = 0; i < count; i++) + dest[i] = tolower(ipb->ipl_info.fcp.scp_data[i]); +out: + dest[count] = '\0'; + return count; +} + +size_t append_ipl_scpdata(char *dest, size_t len) +{ + size_t rc; + + rc = 0; + if (ipl_block_valid && ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP) + rc = reipl_append_ascii_scpdata(dest, len, &ipl_block); + else + dest[0] = 0; + return rc; +} + + +static struct kobj_attribute sys_ipl_vm_parm_attr = + __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); + +static ssize_t sys_ipl_device_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + switch (ipl_info.type) { + case IPL_TYPE_CCW: + return sprintf(page, "0.%x.%04x\n", ipl_block.ipl_info.ccw.ssid, + ipl_block.ipl_info.ccw.devno); + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + return sprintf(page, "0.0.%04x\n", + ipl_block.ipl_info.fcp.devno); + default: + return 0; + } +} + +static struct kobj_attribute sys_ipl_device_attr = + __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); + +static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, &ipl_block, + ipl_block.hdr.len); +} +static struct bin_attribute ipl_parameter_attr = + __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL, + PAGE_SIZE); + +static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.ipl_info.fcp.scp_data_len; + void *scp_data = &ipl_block.ipl_info.fcp.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} +static struct bin_attribute ipl_scp_data_attr = + __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute *ipl_fcp_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_scp_data_attr, + NULL, +}; + +/* FCP ipl device attributes */ + +DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", + (unsigned long long)ipl_block.ipl_info.fcp.wwpn); +DEFINE_IPL_ATTR_RO(ipl_fcp, lun, "0x%016llx\n", + (unsigned long long)ipl_block.ipl_info.fcp.lun); +DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", + (unsigned long long)ipl_block.ipl_info.fcp.bootprog); +DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", + (unsigned long long)ipl_block.ipl_info.fcp.br_lba); + +static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char loadparm[LOADPARM_LEN + 1] = {}; + + if (!sclp_ipl_info.is_valid) + return sprintf(page, "#unknown#\n"); + memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + EBCASC(loadparm, LOADPARM_LEN); + strim(loadparm); + return sprintf(page, "%s\n", loadparm); +} + +static struct kobj_attribute sys_ipl_ccw_loadparm_attr = + __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); + +static struct attribute *ipl_fcp_attrs[] = { + &sys_ipl_type_attr.attr, + &sys_ipl_device_attr.attr, + &sys_ipl_fcp_wwpn_attr.attr, + &sys_ipl_fcp_lun_attr.attr, + &sys_ipl_fcp_bootprog_attr.attr, + &sys_ipl_fcp_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_fcp_attr_group = { + .attrs = ipl_fcp_attrs, + .bin_attrs = ipl_fcp_bin_attrs, +}; + +/* CCW ipl device attributes */ + +static struct attribute *ipl_ccw_attrs_vm[] = { + &sys_ipl_type_attr.attr, + &sys_ipl_device_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_vm_parm_attr.attr, + NULL, +}; + +static struct attribute *ipl_ccw_attrs_lpar[] = { + &sys_ipl_type_attr.attr, + &sys_ipl_device_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_ccw_attr_group_vm = { + .attrs = ipl_ccw_attrs_vm, +}; + +static struct attribute_group ipl_ccw_attr_group_lpar = { + .attrs = ipl_ccw_attrs_lpar +}; + +/* UNKNOWN ipl device attributes */ + +static struct attribute *ipl_unknown_attrs[] = { + &sys_ipl_type_attr.attr, + NULL, +}; + +static struct attribute_group ipl_unknown_attr_group = { + .attrs = ipl_unknown_attrs, +}; + +static struct kset *ipl_kset; + +static void __ipl_run(void *unused) +{ + __bpon(); + diag308(DIAG308_LOAD_CLEAR, NULL); +} + +static void ipl_run(struct shutdown_trigger *trigger) +{ + smp_call_ipl_cpu(__ipl_run, NULL); +} + +static int __init ipl_init(void) +{ + int rc; + + ipl_kset = kset_create_and_add("ipl", NULL, firmware_kobj); + if (!ipl_kset) { + rc = -ENOMEM; + goto out; + } + switch (ipl_info.type) { + case IPL_TYPE_CCW: + if (MACHINE_IS_VM) + rc = sysfs_create_group(&ipl_kset->kobj, + &ipl_ccw_attr_group_vm); + else + rc = sysfs_create_group(&ipl_kset->kobj, + &ipl_ccw_attr_group_lpar); + break; + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); + break; + default: + rc = sysfs_create_group(&ipl_kset->kobj, + &ipl_unknown_attr_group); + break; + } +out: + if (rc) + panic("ipl_init failed: rc = %i\n", rc); + + return 0; +} + +static struct shutdown_action __refdata ipl_action = { + .name = SHUTDOWN_ACTION_IPL_STR, + .fn = ipl_run, + .init = ipl_init, +}; + +/* + * reipl shutdown action: Reboot Linux on shutdown. + */ + +/* VM IPL PARM attributes */ +static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, + char *page) +{ + char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; + + reipl_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); + return sprintf(page, "%s\n", vmparm); +} + +static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, + size_t vmparm_max, + const char *buf, size_t len) +{ + int i, ip_len; + + /* ignore trailing newline */ + ip_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + ip_len--; + + if (ip_len > vmparm_max) + return -EINVAL; + + /* parm is used to store kernel options, check for common chars */ + for (i = 0; i < ip_len; i++) + if (!(isalnum(buf[i]) || isascii(buf[i]) || isprint(buf[i]))) + return -EINVAL; + + memset(ipb->ipl_info.ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); + ipb->ipl_info.ccw.vm_parm_len = ip_len; + if (ip_len > 0) { + ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + memcpy(ipb->ipl_info.ccw.vm_parm, buf, ip_len); + ASCEBC(ipb->ipl_info.ccw.vm_parm, ip_len); + } else { + ipb->ipl_info.ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; + } + + return len; +} + +/* NSS wrapper */ +static ssize_t reipl_nss_vmparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_vmparm_show(reipl_block_nss, page); +} + +static ssize_t reipl_nss_vmparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_vmparm_store(reipl_block_nss, 56, buf, len); +} + +/* CCW wrapper */ +static ssize_t reipl_ccw_vmparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_vmparm_show(reipl_block_ccw, page); +} + +static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_vmparm_store(reipl_block_ccw, 64, buf, len); +} + +static struct kobj_attribute sys_reipl_nss_vmparm_attr = + __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); +static struct kobj_attribute sys_reipl_ccw_vmparm_attr = + __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); + +/* FCP reipl device attributes */ + +static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_fcp->ipl_info.fcp.scp_data_len; + void *scp_data = reipl_block_fcp->ipl_info.fcp.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + + if (off) + return -EINVAL; + + memcpy(reipl_block_fcp->ipl_info.fcp.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_fcp->ipl_info.fcp.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_fcp->ipl_info.fcp.scp_data_len = scpdata_len; + reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; + reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; + + return count; +} +static struct bin_attribute sys_reipl_fcp_scp_data_attr = + __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read, + reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_fcp_bin_attrs[] = { + &sys_reipl_fcp_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", + reipl_block_fcp->ipl_info.fcp.wwpn); +DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", + reipl_block_fcp->ipl_info.fcp.lun); +DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", + reipl_block_fcp->ipl_info.fcp.bootprog); +DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", + reipl_block_fcp->ipl_info.fcp.br_lba); +DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", + reipl_block_fcp->ipl_info.fcp.devno); + +static void reipl_get_ascii_loadparm(char *loadparm, + struct ipl_parameter_block *ibp) +{ + memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN); + EBCASC(loadparm, LOADPARM_LEN); + loadparm[LOADPARM_LEN] = 0; + strim(loadparm); +} + +static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb, + char *page) +{ + char buf[LOADPARM_LEN + 1]; + + reipl_get_ascii_loadparm(buf, ipb); + return sprintf(page, "%s\n", buf); +} + +static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, + const char *buf, size_t len) +{ + int i, lp_len; + + /* ignore trailing newline */ + lp_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + lp_len--; + /* loadparm can have max 8 characters and must not start with a blank */ + if ((lp_len > LOADPARM_LEN) || ((lp_len > 0) && (buf[0] == ' '))) + return -EINVAL; + /* loadparm can only contain "a-z,A-Z,0-9,SP,." */ + for (i = 0; i < lp_len; i++) { + if (isalpha(buf[i]) || isdigit(buf[i]) || (buf[i] == ' ') || + (buf[i] == '.')) + continue; + return -EINVAL; + } + /* initialize loadparm with blanks */ + memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN); + /* copy and convert to ebcdic */ + memcpy(ipb->hdr.loadparm, buf, lp_len); + ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); + ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; + return len; +} + +/* FCP wrapper */ +static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_loadparm_show(reipl_block_fcp, page); +} + +static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); +} + +static struct kobj_attribute sys_reipl_fcp_loadparm_attr = + __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, + reipl_fcp_loadparm_store); + +static struct attribute *reipl_fcp_attrs[] = { + &sys_reipl_fcp_device_attr.attr, + &sys_reipl_fcp_wwpn_attr.attr, + &sys_reipl_fcp_lun_attr.attr, + &sys_reipl_fcp_bootprog_attr.attr, + &sys_reipl_fcp_br_lba_attr.attr, + &sys_reipl_fcp_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_fcp_attr_group = { + .attrs = reipl_fcp_attrs, + .bin_attrs = reipl_fcp_bin_attrs, +}; + +/* CCW reipl device attributes */ +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ipl_info.ccw); + +/* NSS wrapper */ +static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_loadparm_show(reipl_block_nss, page); +} + +static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_loadparm_store(reipl_block_nss, buf, len); +} + +/* CCW wrapper */ +static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_loadparm_show(reipl_block_ccw, page); +} + +static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); +} + +static struct kobj_attribute sys_reipl_ccw_loadparm_attr = + __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, + reipl_ccw_loadparm_store); + +static struct attribute *reipl_ccw_attrs_vm[] = { + &sys_reipl_ccw_device_attr.attr, + &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_vmparm_attr.attr, + NULL, +}; + +static struct attribute *reipl_ccw_attrs_lpar[] = { + &sys_reipl_ccw_device_attr.attr, + &sys_reipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_ccw_attr_group_vm = { + .name = IPL_CCW_STR, + .attrs = reipl_ccw_attrs_vm, +}; + +static struct attribute_group reipl_ccw_attr_group_lpar = { + .name = IPL_CCW_STR, + .attrs = reipl_ccw_attrs_lpar, +}; + + +/* NSS reipl device attributes */ +static void reipl_get_ascii_nss_name(char *dst, + struct ipl_parameter_block *ipb) +{ + memcpy(dst, ipb->ipl_info.ccw.nss_name, NSS_NAME_SIZE); + EBCASC(dst, NSS_NAME_SIZE); + dst[NSS_NAME_SIZE] = 0; +} + +static ssize_t reipl_nss_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char nss_name[NSS_NAME_SIZE + 1] = {}; + + reipl_get_ascii_nss_name(nss_name, reipl_block_nss); + return sprintf(page, "%s\n", nss_name); +} + +static ssize_t reipl_nss_name_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int nss_len; + + /* ignore trailing newline */ + nss_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + nss_len--; + + if (nss_len > NSS_NAME_SIZE) + return -EINVAL; + + memset(reipl_block_nss->ipl_info.ccw.nss_name, 0x40, NSS_NAME_SIZE); + if (nss_len > 0) { + reipl_block_nss->ipl_info.ccw.vm_flags |= + DIAG308_VM_FLAGS_NSS_VALID; + memcpy(reipl_block_nss->ipl_info.ccw.nss_name, buf, nss_len); + ASCEBC(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); + EBC_TOUPPER(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); + } else { + reipl_block_nss->ipl_info.ccw.vm_flags &= + ~DIAG308_VM_FLAGS_NSS_VALID; + } + + return len; +} + +static struct kobj_attribute sys_reipl_nss_name_attr = + __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show, + reipl_nss_name_store); + +static struct kobj_attribute sys_reipl_nss_loadparm_attr = + __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show, + reipl_nss_loadparm_store); + +static struct attribute *reipl_nss_attrs[] = { + &sys_reipl_nss_name_attr.attr, + &sys_reipl_nss_loadparm_attr.attr, + &sys_reipl_nss_vmparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nss_attr_group = { + .name = IPL_NSS_STR, + .attrs = reipl_nss_attrs, +}; + +void set_os_info_reipl_block(void) +{ + os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); +} + +/* reipl type */ + +static int reipl_set_type(enum ipl_type type) +{ + if (!(reipl_capabilities & type)) + return -EINVAL; + + switch(type) { + case IPL_TYPE_CCW: + reipl_block_actual = reipl_block_ccw; + break; + case IPL_TYPE_FCP: + reipl_block_actual = reipl_block_fcp; + break; + case IPL_TYPE_NSS: + reipl_block_actual = reipl_block_nss; + break; + default: + break; + } + reipl_type = type; + return 0; +} + +static ssize_t reipl_type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", ipl_type_str(reipl_type)); +} + +static ssize_t reipl_type_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int rc = -EINVAL; + + if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_FCP); + else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NSS); + return (rc != 0) ? rc : len; +} + +static struct kobj_attribute reipl_type_attr = + __ATTR(reipl_type, 0644, reipl_type_show, reipl_type_store); + +static struct kset *reipl_kset; +static struct kset *reipl_fcp_kset; + +static void __reipl_run(void *unused) +{ + switch (reipl_type) { + case IPL_TYPE_CCW: + diag308(DIAG308_SET, reipl_block_ccw); + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_FCP: + diag308(DIAG308_SET, reipl_block_fcp); + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_NSS: + diag308(DIAG308_SET, reipl_block_nss); + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_UNKNOWN: + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_FCP_DUMP: + break; + } + disabled_wait((unsigned long) __builtin_return_address(0)); +} + +static void reipl_run(struct shutdown_trigger *trigger) +{ + smp_call_ipl_cpu(__reipl_run, NULL); +} + +static void reipl_block_ccw_init(struct ipl_parameter_block *ipb) +{ + ipb->hdr.len = IPL_PARM_BLK_CCW_LEN; + ipb->hdr.version = IPL_PARM_BLOCK_VERSION; + ipb->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; + ipb->hdr.pbt = DIAG308_IPL_TYPE_CCW; +} + +static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) +{ + /* LOADPARM */ + /* check if read scp info worked and set loadparm */ + if (sclp_ipl_info.is_valid) + memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + else + /* read scp info failed: set empty loadparm (EBCDIC blanks) */ + memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN); + ipb->hdr.flags = DIAG308_FLAGS_LP_VALID; + + /* VM PARM */ + if (MACHINE_IS_VM && ipl_block_valid && + (ipl_block.ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { + + ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + ipb->ipl_info.ccw.vm_parm_len = + ipl_block.ipl_info.ccw.vm_parm_len; + memcpy(ipb->ipl_info.ccw.vm_parm, + ipl_block.ipl_info.ccw.vm_parm, DIAG308_VMPARM_SIZE); + } +} + +static int __init reipl_nss_init(void) +{ + int rc; + + if (!MACHINE_IS_VM) + return 0; + + reipl_block_nss = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nss) + return -ENOMEM; + + rc = sysfs_create_group(&reipl_kset->kobj, &reipl_nss_attr_group); + if (rc) + return rc; + + reipl_block_ccw_init(reipl_block_nss); + reipl_capabilities |= IPL_TYPE_NSS; + return 0; +} + +static int __init reipl_ccw_init(void) +{ + int rc; + + reipl_block_ccw = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_ccw) + return -ENOMEM; + + rc = sysfs_create_group(&reipl_kset->kobj, + MACHINE_IS_VM ? &reipl_ccw_attr_group_vm + : &reipl_ccw_attr_group_lpar); + if (rc) + return rc; + + reipl_block_ccw_init(reipl_block_ccw); + if (ipl_info.type == IPL_TYPE_CCW) { + reipl_block_ccw->ipl_info.ccw.ssid = ipl_block.ipl_info.ccw.ssid; + reipl_block_ccw->ipl_info.ccw.devno = ipl_block.ipl_info.ccw.devno; + reipl_block_ccw_fill_parms(reipl_block_ccw); + } + + reipl_capabilities |= IPL_TYPE_CCW; + return 0; +} + +static int __init reipl_fcp_init(void) +{ + int rc; + + reipl_block_fcp = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_fcp) + return -ENOMEM; + + /* sysfs: create fcp kset for mixing attr group and bin attrs */ + reipl_fcp_kset = kset_create_and_add(IPL_FCP_STR, NULL, + &reipl_kset->kobj); + if (!reipl_fcp_kset) { + free_page((unsigned long) reipl_block_fcp); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); + if (rc) { + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; + } + + if (ipl_info.type == IPL_TYPE_FCP) { + memcpy(reipl_block_fcp, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the SCSI IPL parameter block, so take it + * always from sclp_ipl_info. + */ + memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; + reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; + reipl_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_FCP; + return 0; +} + +static int __init reipl_type_init(void) +{ + enum ipl_type reipl_type = ipl_info.type; + struct ipl_parameter_block *reipl_block; + unsigned long size; + + reipl_block = os_info_old_entry(OS_INFO_REIPL_BLOCK, &size); + if (!reipl_block) + goto out; + /* + * If we have an OS info reipl block, this will be used + */ + if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_FCP) { + memcpy(reipl_block_fcp, reipl_block, size); + reipl_type = IPL_TYPE_FCP; + } else if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_CCW) { + memcpy(reipl_block_ccw, reipl_block, size); + reipl_type = IPL_TYPE_CCW; + } +out: + return reipl_set_type(reipl_type); +} + +static int __init reipl_init(void) +{ + int rc; + + reipl_kset = kset_create_and_add("reipl", NULL, firmware_kobj); + if (!reipl_kset) + return -ENOMEM; + rc = sysfs_create_file(&reipl_kset->kobj, &reipl_type_attr.attr); + if (rc) { + kset_unregister(reipl_kset); + return rc; + } + rc = reipl_ccw_init(); + if (rc) + return rc; + rc = reipl_fcp_init(); + if (rc) + return rc; + rc = reipl_nss_init(); + if (rc) + return rc; + return reipl_type_init(); +} + +static struct shutdown_action __refdata reipl_action = { + .name = SHUTDOWN_ACTION_REIPL_STR, + .fn = reipl_run, + .init = reipl_init, +}; + +/* + * dump shutdown action: Dump Linux on shutdown. + */ + +/* FCP dump device attributes */ + +DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", + dump_block_fcp->ipl_info.fcp.wwpn); +DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", + dump_block_fcp->ipl_info.fcp.lun); +DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", + dump_block_fcp->ipl_info.fcp.bootprog); +DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", + dump_block_fcp->ipl_info.fcp.br_lba); +DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", + dump_block_fcp->ipl_info.fcp.devno); + +static struct attribute *dump_fcp_attrs[] = { + &sys_dump_fcp_device_attr.attr, + &sys_dump_fcp_wwpn_attr.attr, + &sys_dump_fcp_lun_attr.attr, + &sys_dump_fcp_bootprog_attr.attr, + &sys_dump_fcp_br_lba_attr.attr, + NULL, +}; + +static struct attribute_group dump_fcp_attr_group = { + .name = IPL_FCP_STR, + .attrs = dump_fcp_attrs, +}; + +/* CCW dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ipl_info.ccw); + +static struct attribute *dump_ccw_attrs[] = { + &sys_dump_ccw_device_attr.attr, + NULL, +}; + +static struct attribute_group dump_ccw_attr_group = { + .name = IPL_CCW_STR, + .attrs = dump_ccw_attrs, +}; + +/* dump type */ + +static int dump_set_type(enum dump_type type) +{ + if (!(dump_capabilities & type)) + return -EINVAL; + dump_type = type; + return 0; +} + +static ssize_t dump_type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", dump_type_str(dump_type)); +} + +static ssize_t dump_type_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int rc = -EINVAL; + + if (strncmp(buf, DUMP_NONE_STR, strlen(DUMP_NONE_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NONE); + else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_FCP); + return (rc != 0) ? rc : len; +} + +static struct kobj_attribute dump_type_attr = + __ATTR(dump_type, 0644, dump_type_show, dump_type_store); + +static struct kset *dump_kset; + +static void diag308_dump(void *dump_block) +{ + diag308(DIAG308_SET, dump_block); + while (1) { + if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) + break; + udelay_simple(USEC_PER_SEC); + } +} + +static void __dump_run(void *unused) +{ + switch (dump_type) { + case DUMP_TYPE_CCW: + diag308_dump(dump_block_ccw); + break; + case DUMP_TYPE_FCP: + diag308_dump(dump_block_fcp); + break; + default: + break; + } +} + +static void dump_run(struct shutdown_trigger *trigger) +{ + if (dump_type == DUMP_TYPE_NONE) + return; + smp_send_stop(); + smp_call_ipl_cpu(__dump_run, NULL); +} + +static int __init dump_ccw_init(void) +{ + int rc; + + dump_block_ccw = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_ccw) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_ccw_attr_group); + if (rc) { + free_page((unsigned long)dump_block_ccw); + return rc; + } + dump_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; + dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; + dump_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; + dump_capabilities |= DUMP_TYPE_CCW; + return 0; +} + +static int __init dump_fcp_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_fcp = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_fcp) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_fcp_attr_group); + if (rc) { + free_page((unsigned long)dump_block_fcp); + return rc; + } + dump_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; + dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; + dump_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_FCP; + return 0; +} + +static int __init dump_init(void) +{ + int rc; + + dump_kset = kset_create_and_add("dump", NULL, firmware_kobj); + if (!dump_kset) + return -ENOMEM; + rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr); + if (rc) { + kset_unregister(dump_kset); + return rc; + } + rc = dump_ccw_init(); + if (rc) + return rc; + rc = dump_fcp_init(); + if (rc) + return rc; + dump_set_type(DUMP_TYPE_NONE); + return 0; +} + +static struct shutdown_action __refdata dump_action = { + .name = SHUTDOWN_ACTION_DUMP_STR, + .fn = dump_run, + .init = dump_init, +}; + +static void dump_reipl_run(struct shutdown_trigger *trigger) +{ + unsigned long ipib = (unsigned long) reipl_block_actual; + unsigned int csum; + + csum = (__force unsigned int) + csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); + mem_assign_absolute(S390_lowcore.ipib, ipib); + mem_assign_absolute(S390_lowcore.ipib_checksum, csum); + dump_run(trigger); +} + +static struct shutdown_action __refdata dump_reipl_action = { + .name = SHUTDOWN_ACTION_DUMP_REIPL_STR, + .fn = dump_reipl_run, +}; + +/* + * vmcmd shutdown action: Trigger vm command on shutdown. + */ + +static char vmcmd_on_reboot[128]; +static char vmcmd_on_panic[128]; +static char vmcmd_on_halt[128]; +static char vmcmd_on_poff[128]; +static char vmcmd_on_restart[128]; + +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_halt, "%s\n", "%s\n", vmcmd_on_halt); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_poff, "%s\n", "%s\n", vmcmd_on_poff); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_restart, "%s\n", "%s\n", vmcmd_on_restart); + +static struct attribute *vmcmd_attrs[] = { + &sys_vmcmd_on_reboot_attr.attr, + &sys_vmcmd_on_panic_attr.attr, + &sys_vmcmd_on_halt_attr.attr, + &sys_vmcmd_on_poff_attr.attr, + &sys_vmcmd_on_restart_attr.attr, + NULL, +}; + +static struct attribute_group vmcmd_attr_group = { + .attrs = vmcmd_attrs, +}; + +static struct kset *vmcmd_kset; + +static void vmcmd_run(struct shutdown_trigger *trigger) +{ + char *cmd; + + if (strcmp(trigger->name, ON_REIPL_STR) == 0) + cmd = vmcmd_on_reboot; + else if (strcmp(trigger->name, ON_PANIC_STR) == 0) + cmd = vmcmd_on_panic; + else if (strcmp(trigger->name, ON_HALT_STR) == 0) + cmd = vmcmd_on_halt; + else if (strcmp(trigger->name, ON_POFF_STR) == 0) + cmd = vmcmd_on_poff; + else if (strcmp(trigger->name, ON_RESTART_STR) == 0) + cmd = vmcmd_on_restart; + else + return; + + if (strlen(cmd) == 0) + return; + __cpcmd(cmd, NULL, 0, NULL); +} + +static int vmcmd_init(void) +{ + if (!MACHINE_IS_VM) + return -EOPNOTSUPP; + vmcmd_kset = kset_create_and_add("vmcmd", NULL, firmware_kobj); + if (!vmcmd_kset) + return -ENOMEM; + return sysfs_create_group(&vmcmd_kset->kobj, &vmcmd_attr_group); +} + +static struct shutdown_action vmcmd_action = {SHUTDOWN_ACTION_VMCMD_STR, + vmcmd_run, vmcmd_init}; + +/* + * stop shutdown action: Stop Linux on shutdown. + */ + +static void stop_run(struct shutdown_trigger *trigger) +{ + if (strcmp(trigger->name, ON_PANIC_STR) == 0 || + strcmp(trigger->name, ON_RESTART_STR) == 0) + disabled_wait((unsigned long) __builtin_return_address(0)); + smp_stop_cpu(); +} + +static struct shutdown_action stop_action = {SHUTDOWN_ACTION_STOP_STR, + stop_run, NULL}; + +/* action list */ + +static struct shutdown_action *shutdown_actions_list[] = { + &ipl_action, &reipl_action, &dump_reipl_action, &dump_action, + &vmcmd_action, &stop_action}; +#define SHUTDOWN_ACTIONS_COUNT (sizeof(shutdown_actions_list) / sizeof(void *)) + +/* + * Trigger section + */ + +static struct kset *shutdown_actions_kset; + +static int set_trigger(const char *buf, struct shutdown_trigger *trigger, + size_t len) +{ + int i; + + for (i = 0; i < SHUTDOWN_ACTIONS_COUNT; i++) { + if (sysfs_streq(buf, shutdown_actions_list[i]->name)) { + if (shutdown_actions_list[i]->init_rc) { + return shutdown_actions_list[i]->init_rc; + } else { + trigger->action = shutdown_actions_list[i]; + return len; + } + } + } + return -EINVAL; +} + +/* on reipl */ + +static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR, + &reipl_action}; + +static ssize_t on_reboot_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_reboot_trigger.action->name); +} + +static ssize_t on_reboot_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_reboot_trigger, len); +} +static struct kobj_attribute on_reboot_attr = __ATTR_RW(on_reboot); + +static void do_machine_restart(char *__unused) +{ + smp_send_stop(); + on_reboot_trigger.action->fn(&on_reboot_trigger); + reipl_run(NULL); +} +void (*_machine_restart)(char *command) = do_machine_restart; + +/* on panic */ + +static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action}; + +static ssize_t on_panic_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_panic_trigger.action->name); +} + +static ssize_t on_panic_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_panic_trigger, len); +} +static struct kobj_attribute on_panic_attr = __ATTR_RW(on_panic); + +static void do_panic(void) +{ + lgr_info_log(); + on_panic_trigger.action->fn(&on_panic_trigger); + stop_run(&on_panic_trigger); +} + +/* on restart */ + +static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, + &stop_action}; + +static ssize_t on_restart_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_restart_trigger.action->name); +} + +static ssize_t on_restart_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_restart_trigger, len); +} +static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart); + +static void __do_restart(void *ignore) +{ + __arch_local_irq_stosm(0x04); /* enable DAT */ + smp_send_stop(); +#ifdef CONFIG_CRASH_DUMP + crash_kexec(NULL); +#endif + on_restart_trigger.action->fn(&on_restart_trigger); + stop_run(&on_restart_trigger); +} + +void do_restart(void) +{ + tracing_off(); + debug_locks_off(); + lgr_info_log(); + smp_call_online_cpu(__do_restart, NULL); +} + +/* on halt */ + +static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; + +static ssize_t on_halt_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_halt_trigger.action->name); +} + +static ssize_t on_halt_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_halt_trigger, len); +} +static struct kobj_attribute on_halt_attr = __ATTR_RW(on_halt); + +static void do_machine_halt(void) +{ + smp_send_stop(); + on_halt_trigger.action->fn(&on_halt_trigger); + stop_run(&on_halt_trigger); +} +void (*_machine_halt)(void) = do_machine_halt; + +/* on power off */ + +static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action}; + +static ssize_t on_poff_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_poff_trigger.action->name); +} + +static ssize_t on_poff_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_poff_trigger, len); +} +static struct kobj_attribute on_poff_attr = __ATTR_RW(on_poff); + +static void do_machine_power_off(void) +{ + smp_send_stop(); + on_poff_trigger.action->fn(&on_poff_trigger); + stop_run(&on_poff_trigger); +} +void (*_machine_power_off)(void) = do_machine_power_off; + +static struct attribute *shutdown_action_attrs[] = { + &on_restart_attr.attr, + &on_reboot_attr.attr, + &on_panic_attr.attr, + &on_halt_attr.attr, + &on_poff_attr.attr, + NULL, +}; + +static struct attribute_group shutdown_action_attr_group = { + .attrs = shutdown_action_attrs, +}; + +static void __init shutdown_triggers_init(void) +{ + shutdown_actions_kset = kset_create_and_add("shutdown_actions", NULL, + firmware_kobj); + if (!shutdown_actions_kset) + goto fail; + if (sysfs_create_group(&shutdown_actions_kset->kobj, + &shutdown_action_attr_group)) + goto fail; + return; +fail: + panic("shutdown_triggers_init failed\n"); +} + +static void __init shutdown_actions_init(void) +{ + int i; + + for (i = 0; i < SHUTDOWN_ACTIONS_COUNT; i++) { + if (!shutdown_actions_list[i]->init) + continue; + shutdown_actions_list[i]->init_rc = + shutdown_actions_list[i]->init(); + } +} + +static int __init s390_ipl_init(void) +{ + char str[8] = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40}; + + sclp_early_get_ipl_info(&sclp_ipl_info); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * returned by read SCP info is invalid (contains EBCDIC blanks) + * when the system has been booted via diag308. In that case we use + * the value from diag308, if available. + * + * There are also systems where diag308 store does not work in + * case the system is booted from HMC. Fortunately in this case + * READ SCP info provides the correct value. + */ + if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 && ipl_block_valid) + memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm, LOADPARM_LEN); + shutdown_actions_init(); + shutdown_triggers_init(); + return 0; +} + +__initcall(s390_ipl_init); + +static void __init strncpy_skip_quote(char *dst, char *src, int n) +{ + int sx, dx; + + dx = 0; + for (sx = 0; src[sx] != 0; sx++) { + if (src[sx] == '"') + continue; + dst[dx++] = src[sx]; + if (dx >= n) + break; + } +} + +static int __init vmcmd_on_reboot_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_reboot, str, 127); + vmcmd_on_reboot[127] = 0; + on_reboot_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmreboot=", vmcmd_on_reboot_setup); + +static int __init vmcmd_on_panic_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_panic, str, 127); + vmcmd_on_panic[127] = 0; + on_panic_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmpanic=", vmcmd_on_panic_setup); + +static int __init vmcmd_on_halt_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_halt, str, 127); + vmcmd_on_halt[127] = 0; + on_halt_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmhalt=", vmcmd_on_halt_setup); + +static int __init vmcmd_on_poff_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_poff, str, 127); + vmcmd_on_poff[127] = 0; + on_poff_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmpoff=", vmcmd_on_poff_setup); + +static int on_panic_notify(struct notifier_block *self, + unsigned long event, void *data) +{ + do_panic(); + return NOTIFY_OK; +} + +static struct notifier_block on_panic_nb = { + .notifier_call = on_panic_notify, + .priority = INT_MIN, +}; + +void __init setup_ipl(void) +{ + BUILD_BUG_ON(sizeof(struct ipl_parameter_block) != PAGE_SIZE); + + ipl_info.type = get_ipl_type(); + switch (ipl_info.type) { + case IPL_TYPE_CCW: + ipl_info.data.ccw.dev_id.ssid = ipl_block.ipl_info.ccw.ssid; + ipl_info.data.ccw.dev_id.devno = ipl_block.ipl_info.ccw.devno; + break; + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + ipl_info.data.fcp.dev_id.ssid = 0; + ipl_info.data.fcp.dev_id.devno = ipl_block.ipl_info.fcp.devno; + ipl_info.data.fcp.wwpn = ipl_block.ipl_info.fcp.wwpn; + ipl_info.data.fcp.lun = ipl_block.ipl_info.fcp.lun; + break; + case IPL_TYPE_NSS: + case IPL_TYPE_UNKNOWN: + /* We have no info to copy */ + break; + } + atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); +} + +void __init ipl_store_parameters(void) +{ + int rc; + + rc = diag308(DIAG308_STORE, &ipl_block); + if (rc == DIAG308_RC_OK && ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) + ipl_block_valid = 1; +} + +void s390_reset_system(void) +{ + /* Disable prefixing */ + set_prefix(0); + + /* Disable lowcore protection */ + __ctl_clear_bit(0, 28); + diag308_reset(); +} diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c new file mode 100644 index 000000000..3d17c4107 --- /dev/null +++ b/arch/s390/kernel/irq.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2004, 2011 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Holger Smolinski <Holger.Smolinski@de.ibm.com>, + * Thomas Spatzier <tspat@de.ibm.com>, + * + * This file contains interrupt related functions. + */ + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/profile.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/irq.h> +#include <asm/irq_regs.h> +#include <asm/cputime.h> +#include <asm/lowcore.h> +#include <asm/irq.h> +#include <asm/hw_irq.h> +#include "entry.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); +EXPORT_PER_CPU_SYMBOL_GPL(irq_stat); + +struct irq_class { + int irq; + char *name; + char *desc; +}; + +/* + * The list of "main" irq classes on s390. This is the list of interrupts + * that appear both in /proc/stat ("intr" line) and /proc/interrupts. + * Historically only external and I/O interrupts have been part of /proc/stat. + * We can't add the split external and I/O sub classes since the first field + * in the "intr" line in /proc/stat is supposed to be the sum of all other + * fields. + * Since the external and I/O interrupt fields are already sums we would end + * up with having a sum which accounts each interrupt twice. + */ +static const struct irq_class irqclass_main_desc[NR_IRQS_BASE] = { + {.irq = EXT_INTERRUPT, .name = "EXT"}, + {.irq = IO_INTERRUPT, .name = "I/O"}, + {.irq = THIN_INTERRUPT, .name = "AIO"}, +}; + +/* + * The list of split external and I/O interrupts that appear only in + * /proc/interrupts. + * In addition this list contains non external / I/O events like NMIs. + */ +static const struct irq_class irqclass_sub_desc[] = { + {.irq = IRQEXT_CLK, .name = "CLK", .desc = "[EXT] Clock Comparator"}, + {.irq = IRQEXT_EXC, .name = "EXC", .desc = "[EXT] External Call"}, + {.irq = IRQEXT_EMS, .name = "EMS", .desc = "[EXT] Emergency Signal"}, + {.irq = IRQEXT_TMR, .name = "TMR", .desc = "[EXT] CPU Timer"}, + {.irq = IRQEXT_TLA, .name = "TAL", .desc = "[EXT] Timing Alert"}, + {.irq = IRQEXT_PFL, .name = "PFL", .desc = "[EXT] Pseudo Page Fault"}, + {.irq = IRQEXT_DSD, .name = "DSD", .desc = "[EXT] DASD Diag"}, + {.irq = IRQEXT_VRT, .name = "VRT", .desc = "[EXT] Virtio"}, + {.irq = IRQEXT_SCP, .name = "SCP", .desc = "[EXT] Service Call"}, + {.irq = IRQEXT_IUC, .name = "IUC", .desc = "[EXT] IUCV"}, + {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, + {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, + {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, + {.irq = IRQIO_QAI, .name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"}, + {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, + {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, + {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, + {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, + {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, + {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, + {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, + {.irq = IRQIO_APB, .name = "APB", .desc = "[I/O] AP Bus"}, + {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, + {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, + {.irq = IRQIO_PCI, .name = "PCI", .desc = "[I/O] PCI Interrupt" }, + {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, + {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, + {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, + {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, + {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, +}; + +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); +} + +void do_IRQ(struct pt_regs *regs, int irq) +{ + struct pt_regs *old_regs; + + old_regs = set_irq_regs(regs); + irq_enter(); + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) + /* Serve timer interrupts first. */ + clock_comparator_work(); + generic_handle_irq(irq); + irq_exit(); + set_irq_regs(old_regs); +} + +/* + * show_interrupts is needed by /proc/interrupts. + */ +int show_interrupts(struct seq_file *p, void *v) +{ + int index = *(loff_t *) v; + int cpu, irq; + + get_online_cpus(); + if (index == 0) { + seq_puts(p, " "); + for_each_online_cpu(cpu) + seq_printf(p, "CPU%d ", cpu); + seq_putc(p, '\n'); + } + if (index < NR_IRQS_BASE) { + seq_printf(p, "%s: ", irqclass_main_desc[index].name); + irq = irqclass_main_desc[index].irq; + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_putc(p, '\n'); + goto out; + } + if (index > NR_IRQS_BASE) + goto out; + + for (index = 0; index < NR_ARCH_IRQS; index++) { + seq_printf(p, "%s: ", irqclass_sub_desc[index].name); + irq = irqclass_sub_desc[index].irq; + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", + per_cpu(irq_stat, cpu).irqs[irq]); + if (irqclass_sub_desc[index].desc) + seq_printf(p, " %s", irqclass_sub_desc[index].desc); + seq_putc(p, '\n'); + } +out: + put_online_cpus(); + return 0; +} + +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < NR_IRQS_BASE ? NR_IRQS_BASE : from; +} + +/* + * Switch to the asynchronous interrupt stack for softirq execution. + */ +void do_softirq_own_stack(void) +{ + unsigned long old, new; + + old = current_stack_pointer(); + /* Check against async. stack address range. */ + new = S390_lowcore.async_stack; + if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { + /* Need to switch to the async. stack. */ + new -= STACK_FRAME_OVERHEAD; + ((struct stack_frame *) new)->back_chain = old; + asm volatile(" la 15,0(%0)\n" + " brasl 14,__do_softirq\n" + " la 15,0(%1)\n" + : : "a" (new), "a" (old) + : "0", "1", "2", "3", "4", "5", "14", + "cc", "memory" ); + } else { + /* We are already on the async stack. */ + __do_softirq(); + } +} + +/* + * ext_int_hash[index] is the list head for all external interrupts that hash + * to this index. + */ +static struct hlist_head ext_int_hash[32] ____cacheline_aligned; + +struct ext_int_info { + ext_int_handler_t handler; + struct hlist_node entry; + struct rcu_head rcu; + u16 code; +}; + +/* ext_int_hash_lock protects the handler lists for external interrupts */ +static DEFINE_SPINLOCK(ext_int_hash_lock); + +static inline int ext_hash(u16 code) +{ + BUILD_BUG_ON(!is_power_of_2(ARRAY_SIZE(ext_int_hash))); + + return (code + (code >> 9)) & (ARRAY_SIZE(ext_int_hash) - 1); +} + +int register_external_irq(u16 code, ext_int_handler_t handler) +{ + struct ext_int_info *p; + unsigned long flags; + int index; + + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (!p) + return -ENOMEM; + p->code = code; + p->handler = handler; + index = ext_hash(code); + + spin_lock_irqsave(&ext_int_hash_lock, flags); + hlist_add_head_rcu(&p->entry, &ext_int_hash[index]); + spin_unlock_irqrestore(&ext_int_hash_lock, flags); + return 0; +} +EXPORT_SYMBOL(register_external_irq); + +int unregister_external_irq(u16 code, ext_int_handler_t handler) +{ + struct ext_int_info *p; + unsigned long flags; + int index = ext_hash(code); + + spin_lock_irqsave(&ext_int_hash_lock, flags); + hlist_for_each_entry_rcu(p, &ext_int_hash[index], entry) { + if (p->code == code && p->handler == handler) { + hlist_del_rcu(&p->entry); + kfree_rcu(p, rcu); + } + } + spin_unlock_irqrestore(&ext_int_hash_lock, flags); + return 0; +} +EXPORT_SYMBOL(unregister_external_irq); + +static irqreturn_t do_ext_interrupt(int irq, void *dummy) +{ + struct pt_regs *regs = get_irq_regs(); + struct ext_code ext_code; + struct ext_int_info *p; + int index; + + ext_code = *(struct ext_code *) ®s->int_code; + if (ext_code.code != EXT_IRQ_CLK_COMP) + set_cpu_flag(CIF_NOHZ_DELAY); + + index = ext_hash(ext_code.code); + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &ext_int_hash[index], entry) { + if (unlikely(p->code != ext_code.code)) + continue; + p->handler(ext_code, regs->int_parm, regs->int_parm_long); + } + rcu_read_unlock(); + return IRQ_HANDLED; +} + +static struct irqaction external_interrupt = { + .name = "EXT", + .handler = do_ext_interrupt, +}; + +void __init init_ext_interrupts(void) +{ + int idx; + + for (idx = 0; idx < ARRAY_SIZE(ext_int_hash); idx++) + INIT_HLIST_HEAD(&ext_int_hash[idx]); + + irq_set_chip_and_handler(EXT_INTERRUPT, + &dummy_irq_chip, handle_percpu_irq); + setup_irq(EXT_INTERRUPT, &external_interrupt); +} + +static DEFINE_SPINLOCK(irq_subclass_lock); +static unsigned char irq_subclass_refcount[64]; + +void irq_subclass_register(enum irq_subclass subclass) +{ + spin_lock(&irq_subclass_lock); + if (!irq_subclass_refcount[subclass]) + ctl_set_bit(0, subclass); + irq_subclass_refcount[subclass]++; + spin_unlock(&irq_subclass_lock); +} +EXPORT_SYMBOL(irq_subclass_register); + +void irq_subclass_unregister(enum irq_subclass subclass) +{ + spin_lock(&irq_subclass_lock); + irq_subclass_refcount[subclass]--; + if (!irq_subclass_refcount[subclass]) + ctl_clear_bit(0, subclass); + spin_unlock(&irq_subclass_lock); +} +EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c new file mode 100644 index 000000000..10009a0cd --- /dev/null +++ b/arch/s390/kernel/jump_label.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Jump label s390 support + * + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/stop_machine.h> +#include <linux/jump_label.h> +#include <asm/ipl.h> + +struct insn { + u16 opcode; + s32 offset; +} __packed; + +struct insn_args { + struct jump_entry *entry; + enum jump_label_type type; +}; + +static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn) +{ + /* brcl 0,0 */ + insn->opcode = 0xc004; + insn->offset = 0; +} + +static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn) +{ + /* brcl 15,offset */ + insn->opcode = 0xc0f4; + insn->offset = (entry->target - entry->code) >> 1; +} + +static void jump_label_bug(struct jump_entry *entry, struct insn *expected, + struct insn *new) +{ + unsigned char *ipc = (unsigned char *)entry->code; + unsigned char *ipe = (unsigned char *)expected; + unsigned char *ipn = (unsigned char *)new; + + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); + pr_emerg("Found: %6ph\n", ipc); + pr_emerg("Expected: %6ph\n", ipe); + pr_emerg("New: %6ph\n", ipn); + panic("Corrupted kernel text"); +} + +static struct insn orignop = { + .opcode = 0xc004, + .offset = JUMP_LABEL_NOP_OFFSET >> 1, +}; + +static void __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + struct insn old, new; + + if (type == JUMP_LABEL_JMP) { + jump_label_make_nop(entry, &old); + jump_label_make_branch(entry, &new); + } else { + jump_label_make_branch(entry, &old); + jump_label_make_nop(entry, &new); + } + if (init) { + if (memcmp((void *)entry->code, &orignop, sizeof(orignop))) + jump_label_bug(entry, &orignop, &new); + } else { + if (memcmp((void *)entry->code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); + } + s390_kernel_write((void *)entry->code, &new, sizeof(new)); +} + +static int __sm_arch_jump_label_transform(void *data) +{ + struct insn_args *args = data; + + __jump_label_transform(args->entry, args->type, 0); + return 0; +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + struct insn_args args; + + args.entry = entry; + args.type = type; + + stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); +} + +void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + __jump_label_transform(entry, type, 1); +} diff --git a/arch/s390/kernel/kdebugfs.c b/arch/s390/kernel/kdebugfs.c new file mode 100644 index 000000000..2c46bd6c6 --- /dev/null +++ b/arch/s390/kernel/kdebugfs.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/init.h> + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("s390", NULL); + if (IS_ERR(arch_debugfs_dir)) + arch_debugfs_dir = NULL; + return 0; +} +postcore_initcall(arch_kdebugfs_init); diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c new file mode 100644 index 000000000..5cf340b77 --- /dev/null +++ b/arch/s390/kernel/kexec_elf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ELF loader for kexec_file_load system call. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <asm/setup.h> + +static int kexec_file_add_elf_kernel(struct kimage *image, + struct s390_load_data *data, + char *kernel, unsigned long kernel_len) +{ + struct kexec_buf buf; + const Elf_Ehdr *ehdr; + const Elf_Phdr *phdr; + Elf_Addr entry; + int i, ret; + + ehdr = (Elf_Ehdr *)kernel; + buf.image = image; + if (image->type == KEXEC_TYPE_CRASH) + entry = STARTUP_KDUMP_OFFSET; + else + entry = ehdr->e_entry; + + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + + buf.buffer = kernel + phdr->p_offset; + buf.bufsz = phdr->p_filesz; + + buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); + buf.memsz = phdr->p_memsz; + + if (entry - phdr->p_paddr < phdr->p_memsz) { + data->kernel_buf = buf.buffer; + data->memsz += STARTUP_NORMAL_OFFSET; + + buf.buffer += STARTUP_NORMAL_OFFSET; + buf.bufsz -= STARTUP_NORMAL_OFFSET; + + buf.mem += STARTUP_NORMAL_OFFSET; + buf.memsz -= STARTUP_NORMAL_OFFSET; + } + + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_add_buffer(&buf); + if (ret) + return ret; + + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; + } + + return 0; +} + +static void *s390_elf_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct s390_load_data data = {0}; + const Elf_Ehdr *ehdr; + const Elf_Phdr *phdr; + size_t size; + int i, ret; + + /* image->fobs->probe already checked for valid ELF magic number. */ + ehdr = (Elf_Ehdr *)kernel; + + if (ehdr->e_type != ET_EXEC || + ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + !elf_check_arch(ehdr)) + return ERR_PTR(-EINVAL); + + if (!ehdr->e_phnum || ehdr->e_phentsize != sizeof(Elf_Phdr)) + return ERR_PTR(-EINVAL); + + size = ehdr->e_ehsize + ehdr->e_phoff; + size += ehdr->e_phentsize * ehdr->e_phnum; + if (size > kernel_len) + return ERR_PTR(-EINVAL); + + phdr = (void *)ehdr + ehdr->e_phoff; + size = ALIGN(size, phdr->p_align); + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_INTERP) + return ERR_PTR(-EINVAL); + + if (phdr->p_offset > kernel_len) + return ERR_PTR(-EINVAL); + + size += ALIGN(phdr->p_filesz, phdr->p_align); + } + + if (size > kernel_len) + return ERR_PTR(-EINVAL); + + ret = kexec_file_add_elf_kernel(image, &data, kernel, kernel_len); + if (ret) + return ERR_PTR(ret); + + if (!data.memsz) + return ERR_PTR(-EINVAL); + + if (initrd) { + ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); + if (ret) + return ERR_PTR(ret); + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + return ERR_PTR(ret); + + return kexec_file_update_kernel(image, &data); +} + +static int s390_elf_probe(const char *buf, unsigned long len) +{ + const Elf_Ehdr *ehdr; + + if (len < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + ehdr = (Elf_Ehdr *)buf; + + /* Only check the ELF magic number here and do proper validity check + * in the loader. Any check here that fails would send the erroneous + * ELF file to the image loader that does not care what it gets. + * (Most likely) causing behavior not intended by the user. + */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + return -ENOEXEC; + + return 0; +} + +const struct kexec_file_ops s390_kexec_elf_ops = { + .probe = s390_elf_probe, + .load = s390_elf_load, +}; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c new file mode 100644 index 000000000..380085259 --- /dev/null +++ b/arch/s390/kernel/kexec_image.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Image loader for kexec_file_load system call. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <asm/setup.h> + +static int kexec_file_add_image_kernel(struct kimage *image, + struct s390_load_data *data, + char *kernel, unsigned long kernel_len) +{ + struct kexec_buf buf; + int ret; + + buf.image = image; + + buf.buffer = kernel + STARTUP_NORMAL_OFFSET; + buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET; + + buf.mem = STARTUP_NORMAL_OFFSET; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + buf.memsz = buf.bufsz; + + ret = kexec_add_buffer(&buf); + + data->kernel_buf = kernel; + data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; + + return ret; +} + +static void *s390_image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct s390_load_data data = {0}; + int ret; + + ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len); + if (ret) + return ERR_PTR(ret); + + if (initrd) { + ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); + if (ret) + return ERR_PTR(ret); + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + return ERR_PTR(ret); + + return kexec_file_update_kernel(image, &data); +} + +static int s390_image_probe(const char *buf, unsigned long len) +{ + /* Can't reliably tell if an image is valid. Therefore give the + * user whatever he wants. + */ + return 0; +} + +const struct kexec_file_ops s390_kexec_image_ops = { + .probe = s390_image_probe, + .load = s390_image_load, +}; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c new file mode 100644 index 000000000..7c0a095e9 --- /dev/null +++ b/arch/s390/kernel/kprobes.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel Probes (KProbes) + * + * Copyright IBM Corp. 2002, 2006 + * + * s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com> + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/stop_machine.h> +#include <linux/kdebug.h> +#include <linux/uaccess.h> +#include <linux/extable.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/ftrace.h> +#include <asm/set_memory.h> +#include <asm/sections.h> +#include <asm/dis.h> + +DEFINE_PER_CPU(struct kprobe *, current_kprobe); +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +struct kretprobe_blackpoint kretprobe_blacklist[] = { }; + +DEFINE_INSN_CACHE_OPS(dmainsn); + +static void *alloc_dmainsn_page(void) +{ + void *page; + + page = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); + if (page) + set_memory_x((unsigned long) page, 1); + return page; +} + +static void free_dmainsn_page(void *page) +{ + set_memory_nx((unsigned long) page, 1); + free_page((unsigned long)page); +} + +struct kprobe_insn_cache kprobe_dmainsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), + .alloc = alloc_dmainsn_page, + .free = free_dmainsn_page, + .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), + .insn_size = MAX_INSN_SIZE, +}; + +static void copy_instruction(struct kprobe *p) +{ + unsigned long ip = (unsigned long) p->addr; + s64 disp, new_disp; + u64 addr, new_addr; + + if (ftrace_location(ip) == ip) { + /* + * If kprobes patches the instruction that is morphed by + * ftrace make sure that kprobes always sees the branch + * "jg .+24" that skips the mcount block or the "brcl 0,0" + * in case of hotpatch. + */ + ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn); + p->ainsn.is_ftrace_insn = 1; + } else + memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8)); + p->opcode = p->ainsn.insn[0]; + if (!probe_is_insn_relative_long(p->ainsn.insn)) + return; + /* + * For pc-relative instructions in RIL-b or RIL-c format patch the + * RI2 displacement field. We have already made sure that the insn + * slot for the patched instruction is within the same 2GB area + * as the original instruction (either kernel image or module area). + * Therefore the new displacement will always fit. + */ + disp = *(s32 *)&p->ainsn.insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&p->ainsn.insn[1] = new_disp; +} +NOKPROBE_SYMBOL(copy_instruction); + +static inline int is_kernel_addr(void *addr) +{ + return addr < (void *)_end; +} + +static int s390_get_insn_slot(struct kprobe *p) +{ + /* + * Get an insn slot that is within the same 2GB area like the original + * instruction. That way instructions with a 32bit signed displacement + * field can be patched and executed within the insn slot. + */ + p->ainsn.insn = NULL; + if (is_kernel_addr(p->addr)) + p->ainsn.insn = get_dmainsn_slot(); + else if (is_module_addr(p->addr)) + p->ainsn.insn = get_insn_slot(); + return p->ainsn.insn ? 0 : -ENOMEM; +} +NOKPROBE_SYMBOL(s390_get_insn_slot); + +static void s390_free_insn_slot(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + if (is_kernel_addr(p->addr)) + free_dmainsn_slot(p->ainsn.insn, 0); + else + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} +NOKPROBE_SYMBOL(s390_free_insn_slot); + +int arch_prepare_kprobe(struct kprobe *p) +{ + if ((unsigned long) p->addr & 0x01) + return -EINVAL; + /* Make sure the probe isn't going on a difficult instruction */ + if (probe_is_prohibited_opcode(p->addr)) + return -EINVAL; + if (s390_get_insn_slot(p)) + return -ENOMEM; + copy_instruction(p); + return 0; +} +NOKPROBE_SYMBOL(arch_prepare_kprobe); + +int arch_check_ftrace_location(struct kprobe *p) +{ + return 0; +} + +struct swap_insn_args { + struct kprobe *p; + unsigned int arm_kprobe : 1; +}; + +static int swap_instruction(void *data) +{ + struct swap_insn_args *args = data; + struct ftrace_insn new_insn, *insn; + struct kprobe *p = args->p; + size_t len; + + new_insn.opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode; + len = sizeof(new_insn.opc); + if (!p->ainsn.is_ftrace_insn) + goto skip_ftrace; + len = sizeof(new_insn); + insn = (struct ftrace_insn *) p->addr; + if (args->arm_kprobe) { + if (is_ftrace_nop(insn)) + new_insn.disp = KPROBE_ON_FTRACE_NOP; + else + new_insn.disp = KPROBE_ON_FTRACE_CALL; + } else { + ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr); + if (insn->disp == KPROBE_ON_FTRACE_NOP) + ftrace_generate_nop_insn(&new_insn); + } +skip_ftrace: + s390_kernel_write(p->addr, &new_insn, len); + return 0; +} +NOKPROBE_SYMBOL(swap_instruction); + +void arch_arm_kprobe(struct kprobe *p) +{ + struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; + + stop_machine_cpuslocked(swap_instruction, &args, NULL); +} +NOKPROBE_SYMBOL(arch_arm_kprobe); + +void arch_disarm_kprobe(struct kprobe *p) +{ + struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; + + stop_machine_cpuslocked(swap_instruction, &args, NULL); +} +NOKPROBE_SYMBOL(arch_disarm_kprobe); + +void arch_remove_kprobe(struct kprobe *p) +{ + s390_free_insn_slot(p); +} +NOKPROBE_SYMBOL(arch_remove_kprobe); + +static void enable_singlestep(struct kprobe_ctlblk *kcb, + struct pt_regs *regs, + unsigned long ip) +{ + struct per_regs per_kprobe; + + /* Set up the PER control registers %cr9-%cr11 */ + per_kprobe.control = PER_EVENT_IFETCH; + per_kprobe.start = ip; + per_kprobe.end = ip; + + /* Save control regs and psw mask */ + __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + kcb->kprobe_saved_imask = regs->psw.mask & + (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); + + /* Set PER control regs, turns on single step for the given address */ + __ctl_load(per_kprobe, 9, 11); + regs->psw.mask |= PSW_MASK_PER; + regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); + regs->psw.addr = ip; +} +NOKPROBE_SYMBOL(enable_singlestep); + +static void disable_singlestep(struct kprobe_ctlblk *kcb, + struct pt_regs *regs, + unsigned long ip) +{ + /* Restore control regs and psw mask, set new psw address */ + __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + regs->psw.mask &= ~PSW_MASK_PER; + regs->psw.mask |= kcb->kprobe_saved_imask; + regs->psw.addr = ip; +} +NOKPROBE_SYMBOL(disable_singlestep); + +/* + * Activate a kprobe by storing its pointer to current_kprobe. The + * previous kprobe is stored in kcb->prev_kprobe. A stack of up to + * two kprobes can be active, see KPROBE_REENTER. + */ +static void push_kprobe(struct kprobe_ctlblk *kcb, struct kprobe *p) +{ + kcb->prev_kprobe.kp = __this_cpu_read(current_kprobe); + kcb->prev_kprobe.status = kcb->kprobe_status; + __this_cpu_write(current_kprobe, p); +} +NOKPROBE_SYMBOL(push_kprobe); + +/* + * Deactivate a kprobe by backing up to the previous state. If the + * current state is KPROBE_REENTER prev_kprobe.kp will be non-NULL, + * for any other state prev_kprobe.kp will be NULL. + */ +static void pop_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; +} +NOKPROBE_SYMBOL(pop_kprobe); + +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long) &kretprobe_trampoline; +} +NOKPROBE_SYMBOL(arch_prepare_kretprobe); + +static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + kprobes_inc_nmissed_count(p); + break; + case KPROBE_HIT_SS: + case KPROBE_REENTER: + default: + /* + * A kprobe on the code path to single step an instruction + * is a BUG. The code path resides in the .kprobes.text + * section and is executed with interrupts disabled. + */ + pr_err("Invalid kprobe detected.\n"); + dump_kprobe(p); + BUG(); + } +} +NOKPROBE_SYMBOL(kprobe_reenter_check); + +static int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb; + struct kprobe *p; + + /* + * We want to disable preemption for the entire duration of kprobe + * processing. That includes the calls to the pre/post handlers + * and single stepping the kprobe instruction. + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + p = get_kprobe((void *)(regs->psw.addr - 2)); + + if (p) { + if (kprobe_running()) { + /* + * We have hit a kprobe while another is still + * active. This can happen in the pre and post + * handler. Single step the instruction of the + * new probe but do not call any handler function + * of this secondary kprobe. + * push_kprobe and pop_kprobe saves and restores + * the currently active kprobe. + */ + kprobe_reenter_check(kcb, p); + push_kprobe(kcb, p); + kcb->kprobe_status = KPROBE_REENTER; + } else { + /* + * If we have no pre-handler or it returned 0, we + * continue with single stepping. If we have a + * pre-handler and it returned non-zero, it prepped + * for changing execution path, so get out doing + * nothing more here. + */ + push_kprobe(kcb, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (p->pre_handler && p->pre_handler(p, regs)) { + pop_kprobe(kcb); + preempt_enable_no_resched(); + return 1; + } + kcb->kprobe_status = KPROBE_HIT_SS; + } + enable_singlestep(kcb, regs, (unsigned long) p->ainsn.insn); + return 1; + } /* else: + * No kprobe at this address and no active kprobe. The trap has + * not been caused by a kprobe breakpoint. The race of breakpoint + * vs. kprobe remove does not exist because on s390 as we use + * stop_machine to arm/disarm the breakpoints. + */ + preempt_enable_no_resched(); + return 0; +} +NOKPROBE_SYMBOL(kprobe_handler); + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +static void __used kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline: bcr 0,0\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address; + unsigned long trampoline_address; + kprobe_opcode_t *correct_ret_addr; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more than one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + ri = NULL; + orig_ret_address = 0; + correct_ret_addr = NULL; + trampoline_address = (unsigned long) &kretprobe_trampoline; + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long) ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long) ri->ret_addr; + + if (ri->rp && ri->rp->handler) { + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + } + + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + regs->psw.addr = orig_ret_address; + + kretprobe_hash_unlock(current, &flags); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} +NOKPROBE_SYMBOL(trampoline_probe_handler); + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long ip = regs->psw.addr; + int fixup = probe_get_fixup_type(p->ainsn.insn); + + /* Check if the kprobes location is an enabled ftrace caller */ + if (p->ainsn.is_ftrace_insn) { + struct ftrace_insn *insn = (struct ftrace_insn *) p->addr; + struct ftrace_insn call_insn; + + ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr); + /* + * A kprobe on an enabled ftrace call site actually single + * stepped an unconditional branch (ftrace nop equivalent). + * Now we need to fixup things and pretend that a brasl r0,... + * was executed instead. + */ + if (insn->disp == KPROBE_ON_FTRACE_CALL) { + ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE; + regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn); + } + } + + if (fixup & FIXUP_PSW_NORMAL) + ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn; + + if (fixup & FIXUP_BRANCH_NOT_TAKEN) { + int ilen = insn_length(p->ainsn.insn[0] >> 8); + if (ip - (unsigned long) p->ainsn.insn == ilen) + ip = (unsigned long) p->addr + ilen; + } + + if (fixup & FIXUP_RETURN_REGISTER) { + int reg = (p->ainsn.insn[0] & 0xf0) >> 4; + regs->gprs[reg] += (unsigned long) p->addr - + (unsigned long) p->ainsn.insn; + } + + disable_singlestep(kcb, regs, ip); +} +NOKPROBE_SYMBOL(resume_execution); + +static int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe *p = kprobe_running(); + + if (!p) + return 0; + + if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + + resume_execution(p, regs); + pop_kprobe(kcb); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, psw mask + * will have PER set, in which case, continue the remaining processing + * of do_single_step, as if this is not a probe hit. + */ + if (regs->psw.mask & PSW_MASK_PER) + return 0; + + return 1; +} +NOKPROBE_SYMBOL(post_kprobe_handler); + +static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe *p = kprobe_running(); + const struct exception_table_entry *entry; + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the nip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + disable_singlestep(kcb, regs, (unsigned long) p->addr); + pop_kprobe(kcb); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(p); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (p->fault_handler && p->fault_handler(p, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + entry = search_exception_tables(regs->psw.addr); + if (entry) { + regs->psw.addr = extable_fixup(entry); + return 1; + } + + /* + * fixup_exception() could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} +NOKPROBE_SYMBOL(kprobe_trap_handler); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + int ret; + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_disable(); + ret = kprobe_trap_handler(regs, trapnr); + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_restore(regs->psw.mask & ~PSW_MASK_PER); + return ret; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +/* + * Wrapper routine to for handling exceptions. + */ +int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *) data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_disable(); + + switch (val) { + case DIE_BPT: + if (kprobe_handler(regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(regs)) + ret = NOTIFY_STOP; + break; + case DIE_TRAP: + if (!preemptible() && kprobe_running() && + kprobe_trap_handler(regs, args->trapnr)) + ret = NOTIFY_STOP; + break; + default: + break; + } + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_restore(regs->psw.mask & ~PSW_MASK_PER); + + return ret; +} +NOKPROBE_SYMBOL(kprobe_exceptions_notify); + +static struct kprobe trampoline = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline); +} + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; +} +NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c new file mode 100644 index 000000000..452502f9a --- /dev/null +++ b/arch/s390/kernel/lgr.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Guest Relocation (LGR) detection + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <linux/timer.h> +#include <linux/slab.h> +#include <asm/facility.h> +#include <asm/sysinfo.h> +#include <asm/ebcdic.h> +#include <asm/debug.h> +#include <asm/ipl.h> + +#define LGR_TIMER_INTERVAL_SECS (30 * 60) +#define VM_LEVEL_MAX 2 /* Maximum is 8, but we only record two levels */ + +/* + * LGR info: Contains stfle and stsi data + */ +struct lgr_info { + /* Bit field with facility information: 4 DWORDs are stored */ + u64 stfle_fac_list[4]; + /* Level of system (1 = CEC, 2 = LPAR, 3 = z/VM */ + u32 level; + /* Level 1: CEC info (stsi 1.1.1) */ + char manufacturer[16]; + char type[4]; + char sequence[16]; + char plant[4]; + char model[16]; + /* Level 2: LPAR info (stsi 2.2.2) */ + u16 lpar_number; + char name[8]; + /* Level 3: VM info (stsi 3.2.2) */ + u8 vm_count; + struct { + char name[8]; + char cpi[16]; + } vm[VM_LEVEL_MAX]; +} __packed __aligned(8); + +/* + * LGR globals + */ +static char lgr_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static struct lgr_info lgr_info_last; +static struct lgr_info lgr_info_cur; +static struct debug_info *lgr_dbf; + +/* + * Copy buffer and then convert it to ASCII + */ +static void cpascii(char *dst, char *src, int size) +{ + memcpy(dst, src, size); + EBCASC(dst, size); +} + +/* + * Fill LGR info with 1.1.1 stsi data + */ +static void lgr_stsi_1_1_1(struct lgr_info *lgr_info) +{ + struct sysinfo_1_1_1 *si = (void *) lgr_page; + + if (stsi(si, 1, 1, 1)) + return; + cpascii(lgr_info->manufacturer, si->manufacturer, + sizeof(si->manufacturer)); + cpascii(lgr_info->type, si->type, sizeof(si->type)); + cpascii(lgr_info->model, si->model, sizeof(si->model)); + cpascii(lgr_info->sequence, si->sequence, sizeof(si->sequence)); + cpascii(lgr_info->plant, si->plant, sizeof(si->plant)); +} + +/* + * Fill LGR info with 2.2.2 stsi data + */ +static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_2_2_2 *si = (void *) lgr_page; + + if (stsi(si, 2, 2, 2)) + return; + cpascii(lgr_info->name, si->name, sizeof(si->name)); + memcpy(&lgr_info->lpar_number, &si->lpar_number, + sizeof(lgr_info->lpar_number)); +} + +/* + * Fill LGR info with 3.2.2 stsi data + */ +static void lgr_stsi_3_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_3_2_2 *si = (void *) lgr_page; + int i; + + if (stsi(si, 3, 2, 2)) + return; + for (i = 0; i < min_t(u8, si->count, VM_LEVEL_MAX); i++) { + cpascii(lgr_info->vm[i].name, si->vm[i].name, + sizeof(si->vm[i].name)); + cpascii(lgr_info->vm[i].cpi, si->vm[i].cpi, + sizeof(si->vm[i].cpi)); + } + lgr_info->vm_count = si->count; +} + +/* + * Fill LGR info with current data + */ +static void lgr_info_get(struct lgr_info *lgr_info) +{ + int level; + + memset(lgr_info, 0, sizeof(*lgr_info)); + stfle(lgr_info->stfle_fac_list, ARRAY_SIZE(lgr_info->stfle_fac_list)); + level = stsi(NULL, 0, 0, 0); + lgr_info->level = level; + if (level >= 1) + lgr_stsi_1_1_1(lgr_info); + if (level >= 2) + lgr_stsi_2_2_2(lgr_info); + if (level >= 3) + lgr_stsi_3_2_2(lgr_info); +} + +/* + * Check if LGR info has changed and if yes log new LGR info to s390dbf + */ +void lgr_info_log(void) +{ + static DEFINE_SPINLOCK(lgr_info_lock); + unsigned long flags; + + if (!spin_trylock_irqsave(&lgr_info_lock, flags)) + return; + lgr_info_get(&lgr_info_cur); + if (memcmp(&lgr_info_last, &lgr_info_cur, sizeof(lgr_info_cur)) != 0) { + debug_event(lgr_dbf, 1, &lgr_info_cur, sizeof(lgr_info_cur)); + lgr_info_last = lgr_info_cur; + } + spin_unlock_irqrestore(&lgr_info_lock, flags); +} +EXPORT_SYMBOL_GPL(lgr_info_log); + +static void lgr_timer_set(void); + +/* + * LGR timer callback + */ +static void lgr_timer_fn(struct timer_list *unused) +{ + lgr_info_log(); + lgr_timer_set(); +} + +static struct timer_list lgr_timer; + +/* + * Setup next LGR timer + */ +static void lgr_timer_set(void) +{ + mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); +} + +/* + * Initialize LGR: Add s390dbf, write initial lgr_info and setup timer + */ +static int __init lgr_init(void) +{ + lgr_dbf = debug_register("lgr", 1, 1, sizeof(struct lgr_info)); + if (!lgr_dbf) + return -ENOMEM; + debug_register_view(lgr_dbf, &debug_hex_ascii_view); + lgr_info_get(&lgr_info_last); + debug_event(lgr_dbf, 1, &lgr_info_last, sizeof(lgr_info_last)); + timer_setup(&lgr_timer, lgr_timer_fn, TIMER_DEFERRABLE); + lgr_timer_set(); + return 0; +} +device_initcall(lgr_init); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c new file mode 100644 index 000000000..b7020e721 --- /dev/null +++ b/arch/s390/kernel/machine_kexec.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2005, 2011 + * + * Author(s): Rolf Adelsberger, + * Heiko Carstens <heiko.carstens@de.ibm.com> + * Michael Holzheu <holzheu@linux.vnet.ibm.com> + */ + +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/ftrace.h> +#include <linux/debug_locks.h> +#include <linux/suspend.h> +#include <asm/cio.h> +#include <asm/setup.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/ipl.h> +#include <asm/diag.h> +#include <asm/elf.h> +#include <asm/asm-offsets.h> +#include <asm/cacheflush.h> +#include <asm/os_info.h> +#include <asm/set_memory.h> +#include <asm/switch_to.h> +#include <asm/nmi.h> + +typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); + +extern const unsigned char relocate_kernel[]; +extern const unsigned long long relocate_kernel_len; + +#ifdef CONFIG_CRASH_DUMP + +/* + * PM notifier callback for kdump + */ +static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + if (kexec_crash_image) + arch_kexec_protect_crashkres(); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static int __init machine_kdump_pm_init(void) +{ + pm_notifier(machine_kdump_pm_cb, 0); + return 0; +} +arch_initcall(machine_kdump_pm_init); + +/* + * Reset the system, copy boot CPU registers to absolute zero, + * and jump to the kdump image + */ +static void __do_machine_kdump(void *image) +{ + int (*start_kdump)(int); + unsigned long prefix; + + /* store_status() saved the prefix register to lowcore */ + prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + + /* Now do the reset */ + s390_reset_system(); + + /* + * Copy dump CPU store status info to absolute zero. + * This need to be done *after* s390_reset_system set the + * prefix register of this CPU to zero + */ + memcpy((void *) __LC_FPREGS_SAVE_AREA, + (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); + + __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); + start_kdump = (void *)((struct kimage *) image)->start; + start_kdump(1); + + /* Die if start_kdump returns */ + disabled_wait((unsigned long) __builtin_return_address(0)); +} + +/* + * Start kdump: create a LGR log entry, store status of all CPUs and + * branch to __do_machine_kdump. + */ +static noinline void __machine_kdump(void *image) +{ + struct mcesa *mcesa; + union ctlreg2 cr2_old, cr2_new; + int this_cpu, cpu; + + lgr_info_log(); + /* Get status of the other CPUs */ + this_cpu = smp_find_processor_id(stap()); + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + if (smp_store_status(cpu)) + continue; + } + /* Store status of the boot CPU */ + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (MACHINE_HAS_VX) + save_vx_regs((__vector128 *) mcesa->vector_save_area); + if (MACHINE_HAS_GS) { + __ctl_store(cr2_old.val, 2, 2); + cr2_new = cr2_old; + cr2_new.gse = 1; + __ctl_load(cr2_new.val, 2, 2); + save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); + __ctl_load(cr2_old.val, 2, 2); + } + /* + * To create a good backchain for this CPU in the dump store_status + * is passed the address of a function. The address is saved into + * the PSW save area of the boot CPU and the function is invoked as + * a tail call of store_status. The backchain in the dump will look + * like this: + * restart_int_handler -> __machine_kexec -> __do_machine_kdump + * The call to store_status() will not return. + */ + store_status(__do_machine_kdump, image); +} +#endif + +/* + * Check if kdump checksums are valid: We call purgatory with parameter "0" + */ +static bool kdump_csum_valid(struct kimage *image) +{ +#ifdef CONFIG_CRASH_DUMP + int (*start_kdump)(int) = (void *)image->start; + int rc; + + __arch_local_irq_stnsm(0xfb); /* disable DAT */ + rc = start_kdump(0); + __arch_local_irq_stosm(0x04); /* enable DAT */ + return rc == 0; +#else + return false; +#endif +} + +#ifdef CONFIG_CRASH_DUMP + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr, size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + size = begin - crashk_res.start; + if (size) + os_info_crashkernel_add(crashk_res.start, size); + else + os_info_crashkernel_add(0, 0); +} + +static void crash_protect_pages(int protect) +{ + unsigned long size; + + if (!crashk_res.end) + return; + size = resource_size(&crashk_res); + if (protect) + set_memory_ro(crashk_res.start, size >> PAGE_SHIFT); + else + set_memory_rw(crashk_res.start, size >> PAGE_SHIFT); +} + +void arch_kexec_protect_crashkres(void) +{ + crash_protect_pages(1); +} + +void arch_kexec_unprotect_crashkres(void) +{ + crash_protect_pages(0); +} + +#endif + +/* + * Give back memory to hypervisor before new kdump is loaded + */ +static int machine_kexec_prepare_kdump(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (MACHINE_IS_VM) + diag10_range(PFN_DOWN(crashk_res.start), + PFN_DOWN(crashk_res.end - crashk_res.start + 1)); + return 0; +#else + return -EINVAL; +#endif +} + +int machine_kexec_prepare(struct kimage *image) +{ + void *reboot_code_buffer; + + if (image->type == KEXEC_TYPE_CRASH) + return machine_kexec_prepare_kdump(); + + /* We don't support anything but the default image type for now. */ + if (image->type != KEXEC_TYPE_DEFAULT) + return -EINVAL; + + /* Get the destination where the assembler code should be copied to.*/ + reboot_code_buffer = (void *) page_to_phys(image->control_code_page); + + /* Then copy it */ + memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); +} + +void machine_shutdown(void) +{ +} + +void machine_crash_shutdown(struct pt_regs *regs) +{ + set_os_info_reipl_block(); +} + +/* + * Do normal kexec + */ +static void __do_machine_kexec(void *data) +{ + relocate_kernel_t data_mover; + struct kimage *image = data; + + s390_reset_system(); + data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); + + __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ + /* Call the moving routine */ + (*data_mover)(&image->head, image->start); + + /* Die if kexec returns */ + disabled_wait((unsigned long) __builtin_return_address(0)); +} + +/* + * Reset system and call either kdump or normal kexec + */ +static void __machine_kexec(void *data) +{ + __arch_local_irq_stosm(0x04); /* enable DAT */ + pfault_fini(); + tracing_off(); + debug_locks_off(); +#ifdef CONFIG_CRASH_DUMP + if (((struct kimage *) data)->type == KEXEC_TYPE_CRASH) + __machine_kdump(data); +#endif + __do_machine_kexec(data); +} + +/* + * Do either kdump or normal kexec. In case of kdump we first ask + * purgatory, if kdump checksums are valid. + */ +void machine_kexec(struct kimage *image) +{ + if (image->type == KEXEC_TYPE_CRASH && !kdump_csum_valid(image)) + return; + tracer_disable(); + smp_send_stop(); + smp_call_ipl_cpu(__machine_kexec, image); +} diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c new file mode 100644 index 000000000..f413f57f8 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_file.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 code for kexec_file_load system call + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> + */ + +#include <linux/elf.h> +#include <linux/kexec.h> +#include <asm/setup.h> + +const struct kexec_file_ops * const kexec_file_loaders[] = { + &s390_kexec_elf_ops, + &s390_kexec_image_ops, + NULL, +}; + +int *kexec_file_update_kernel(struct kimage *image, + struct s390_load_data *data) +{ + unsigned long *loc; + + if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) + return ERR_PTR(-EINVAL); + + if (image->cmdline_buf_len) + memcpy(data->kernel_buf + COMMAND_LINE_OFFSET, + image->cmdline_buf, image->cmdline_buf_len); + + if (image->type == KEXEC_TYPE_CRASH) { + loc = (unsigned long *)(data->kernel_buf + OLDMEM_BASE_OFFSET); + *loc = crashk_res.start; + + loc = (unsigned long *)(data->kernel_buf + OLDMEM_SIZE_OFFSET); + *loc = crashk_res.end - crashk_res.start + 1; + } + + if (image->initrd_buf) { + loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET); + *loc = data->initrd_load_addr; + + loc = (unsigned long *)(data->kernel_buf + INITRD_SIZE_OFFSET); + *loc = image->initrd_buf_len; + } + + return NULL; +} + +static int kexec_file_update_purgatory(struct kimage *image) +{ + u64 entry, type; + int ret; + + if (image->type == KEXEC_TYPE_CRASH) { + entry = STARTUP_KDUMP_OFFSET; + type = KEXEC_TYPE_CRASH; + } else { + entry = STARTUP_NORMAL_OFFSET; + type = KEXEC_TYPE_DEFAULT; + } + + ret = kexec_purgatory_get_set_symbol(image, "kernel_entry", &entry, + sizeof(entry), false); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "kernel_type", &type, + sizeof(type), false); + if (ret) + return ret; + + if (image->type == KEXEC_TYPE_CRASH) { + u64 crash_size; + + ret = kexec_purgatory_get_set_symbol(image, "crash_start", + &crashk_res.start, + sizeof(crashk_res.start), + false); + if (ret) + return ret; + + crash_size = crashk_res.end - crashk_res.start + 1; + ret = kexec_purgatory_get_set_symbol(image, "crash_size", + &crash_size, + sizeof(crash_size), + false); + } + return ret; +} + +int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) +{ + struct kexec_buf buf; + int ret; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_load_purgatory(image, &buf); + if (ret) + return ret; + + ret = kexec_file_update_purgatory(image); + return ret; +} + +int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, + char *initrd, unsigned long initrd_len) +{ + struct kexec_buf buf; + int ret; + + buf.image = image; + + buf.buffer = initrd; + buf.bufsz = initrd_len; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + buf.memsz = buf.bufsz; + + data->initrd_load_addr = buf.mem; + data->memsz += buf.memsz; + + ret = kexec_add_buffer(&buf); + return ret; +} + +/* + * The kernel is loaded to a fixed location. Turn off kexec_locate_mem_hole + * and provide kbuf->mem by hand. + */ +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab) +{ + Elf_Rela *relas; + int i; + + relas = (void *)pi->ehdr + relsec->sh_offset; + + for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { + const Elf_Sym *sym; /* symbol to relocate */ + unsigned long addr; /* final location after relocation */ + unsigned long val; /* relocated symbol value */ + void *loc; /* tmp location to modify */ + + sym = (void *)pi->ehdr + symtab->sh_offset; + sym += ELF64_R_SYM(relas[i].r_info); + + if (sym->st_shndx == SHN_UNDEF) + return -ENOEXEC; + + if (sym->st_shndx == SHN_COMMON) + return -ENOEXEC; + + if (sym->st_shndx >= pi->ehdr->e_shnum && + sym->st_shndx != SHN_ABS) + return -ENOEXEC; + + loc = pi->purgatory_buf; + loc += section->sh_offset; + loc += relas[i].r_offset; + + val = sym->st_value; + if (sym->st_shndx != SHN_ABS) + val += pi->sechdrs[sym->st_shndx].sh_addr; + val += relas[i].r_addend; + + addr = section->sh_addr + relas[i].r_offset; + + switch (ELF64_R_TYPE(relas[i].r_info)) { + case R_390_8: /* Direct 8 bit. */ + *(u8 *)loc = val; + break; + case R_390_12: /* Direct 12 bit. */ + *(u16 *)loc &= 0xf000; + *(u16 *)loc |= val & 0xfff; + break; + case R_390_16: /* Direct 16 bit. */ + *(u16 *)loc = val; + break; + case R_390_20: /* Direct 20 bit. */ + *(u32 *)loc &= 0xf00000ff; + *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ + *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ + break; + case R_390_32: /* Direct 32 bit. */ + *(u32 *)loc = val; + break; + case R_390_64: /* Direct 64 bit. */ + *(u64 *)loc = val; + break; + case R_390_PC16: /* PC relative 16 bit. */ + *(u16 *)loc = (val - addr); + break; + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + *(u16 *)loc = (val - addr) >> 1; + break; + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + *(u32 *)loc = (val - addr) >> 1; + break; + case R_390_PC32: /* PC relative 32 bit. */ + *(u32 *)loc = (val - addr); + break; + case R_390_PC64: /* PC relative 64 bit. */ + *(u64 *)loc = (val - addr); + break; + default: + break; + } + } + return 0; +} + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + /* A kernel must be at least large enough to contain head.S. During + * load memory in head.S will be accessed, e.g. to register the next + * command line. If the next kernel were smaller the current kernel + * will panic at load. + * + * 0x11000 = sizeof(head.S) + */ + if (buf_len < 0x11000) + return -ENOEXEC; + + return kexec_image_probe_default(image, buf, buf_len); +} diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S new file mode 100644 index 000000000..a59e6b2b2 --- /dev/null +++ b/arch/s390/kernel/mcount.S @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2008, 2009 + * + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/ftrace.h> +#include <asm/nospec-insn.h> +#include <asm/ptrace.h> +#include <asm/export.h> + + GEN_BR_THUNK %r1 + GEN_BR_THUNK %r14 + + .section .kprobes.text, "ax" + +ENTRY(ftrace_stub) + BR_EX %r14 + +#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#ifdef __PACK_STACK +/* allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 +#else +#define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD +#endif + +ENTRY(_mcount) + BR_EX %r14 + +EXPORT_SYMBOL(_mcount) + +ENTRY(ftrace_caller) + .globl ftrace_regs_caller + .set ftrace_regs_caller,ftrace_caller + stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + lgr %r1,%r15 +#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)) + aghi %r0,MCOUNT_RETURN_FIXUP +#endif + # allocate stack frame for ftrace_caller to contain traced function + aghi %r15,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(__SF_GPRS+8*8)(%r15) + stg %r15,(__SF_GPRS+9*8)(%r15) + # allocate pt_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE + stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) + aghi %r1,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(STACK_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + aghik %r2,%r0,-MCOUNT_INSN_SIZE + lgrl %r4,function_trace_op + lgrl %r1,ftrace_func +#else + lgr %r2,%r0 + aghi %r2,-MCOUNT_INSN_SIZE + larl %r4,function_trace_op + lg %r4,0(%r4) + larl %r1,ftrace_func + lg %r1,0(%r1) +#endif + lgr %r3,%r14 + la %r5,STACK_PTREGS(%r15) + BASR_EX %r14,%r1 +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# The j instruction gets runtime patched to a nop instruction. +# See ftrace_enable_ftrace_graph_caller. +ENTRY(ftrace_graph_caller) + j ftrace_graph_caller_end + lg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) + lg %r3,(STACK_PTREGS_PSW+8)(%r15) + brasl %r14,prepare_ftrace_return + stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) +ftrace_graph_caller_end: + .globl ftrace_graph_caller_end +#endif + lg %r1,(STACK_PTREGS_PSW+8)(%r15) + lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) + BR_EX %r1 + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +ENTRY(return_to_handler) + stmg %r2,%r5,32(%r15) + lgr %r1,%r15 + aghi %r15,-STACK_FRAME_OVERHEAD + stg %r1,__SF_BACKCHAIN(%r15) + brasl %r14,ftrace_return_to_handler + aghi %r15,STACK_FRAME_OVERHEAD + lgr %r14,%r2 + lmg %r2,%r5,32(%r15) + BR_EX %r14 + +#endif diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c new file mode 100644 index 000000000..d298d3cb4 --- /dev/null +++ b/arch/s390/kernel/module.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel module help for s390. + * + * S390 version + * Copyright IBM Corp. 2002, 2003 + * Author(s): Arnd Bergmann (arndb@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * based on i386 version + * Copyright (C) 2001 Rusty Russell. + */ +#include <linux/module.h> +#include <linux/elf.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/moduleloader.h> +#include <linux/bug.h> +#include <asm/alternative.h> +#include <asm/nospec-branch.h> +#include <asm/facility.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +#define PLT_ENTRY_SIZE 20 + +void *module_alloc(unsigned long size) +{ + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, + 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} + +void module_arch_freeing_init(struct module *mod) +{ + if (is_livepatch_module(mod) && + mod->state == MODULE_STATE_LIVE) + return; + + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; +} + +static void check_rela(Elf_Rela *rela, struct module *me) +{ + struct mod_arch_syminfo *info; + + info = me->arch.syminfo + ELF_R_SYM (rela->r_info); + switch (ELF_R_TYPE (rela->r_info)) { + case R_390_GOT12: /* 12 bit GOT offset. */ + case R_390_GOT16: /* 16 bit GOT offset. */ + case R_390_GOT20: /* 20 bit GOT offset. */ + case R_390_GOT32: /* 32 bit GOT offset. */ + case R_390_GOT64: /* 64 bit GOT offset. */ + case R_390_GOTENT: /* 32 bit PC rel. to GOT entry shifted by 1. */ + case R_390_GOTPLT12: /* 12 bit offset to jump slot. */ + case R_390_GOTPLT16: /* 16 bit offset to jump slot. */ + case R_390_GOTPLT20: /* 20 bit offset to jump slot. */ + case R_390_GOTPLT32: /* 32 bit offset to jump slot. */ + case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ + case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ + if (info->got_offset == -1UL) { + info->got_offset = me->arch.got_size; + me->arch.got_size += sizeof(void*); + } + break; + case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32DBL: /* 32 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32: /* 32 bit PC relative PLT address. */ + case R_390_PLT64: /* 64 bit PC relative PLT address. */ + case R_390_PLTOFF16: /* 16 bit offset from GOT to PLT. */ + case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ + case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ + if (info->plt_offset == -1UL) { + info->plt_offset = me->arch.plt_size; + me->arch.plt_size += PLT_ENTRY_SIZE; + } + break; + case R_390_COPY: + case R_390_GLOB_DAT: + case R_390_JMP_SLOT: + case R_390_RELATIVE: + /* Only needed if we want to support loading of + modules linked with -shared. */ + break; + } +} + +/* + * Account for GOT and PLT relocations. We can't add sections for + * got and plt but we can increase the core module size. + */ +int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *me) +{ + Elf_Shdr *symtab; + Elf_Sym *symbols; + Elf_Rela *rela; + char *strings; + int nrela, i, j; + + /* Find symbol table and string table. */ + symtab = NULL; + for (i = 0; i < hdr->e_shnum; i++) + switch (sechdrs[i].sh_type) { + case SHT_SYMTAB: + symtab = sechdrs + i; + break; + } + if (!symtab) { + printk(KERN_ERR "module %s: no symbol table\n", me->name); + return -ENOEXEC; + } + + /* Allocate one syminfo structure per symbol. */ + me->arch.nsyms = symtab->sh_size / sizeof(Elf_Sym); + me->arch.syminfo = vmalloc(array_size(sizeof(struct mod_arch_syminfo), + me->arch.nsyms)); + if (!me->arch.syminfo) + return -ENOMEM; + symbols = (void *) hdr + symtab->sh_offset; + strings = (void *) hdr + sechdrs[symtab->sh_link].sh_offset; + for (i = 0; i < me->arch.nsyms; i++) { + if (symbols[i].st_shndx == SHN_UNDEF && + strcmp(strings + symbols[i].st_name, + "_GLOBAL_OFFSET_TABLE_") == 0) + /* "Define" it as absolute. */ + symbols[i].st_shndx = SHN_ABS; + me->arch.syminfo[i].got_offset = -1UL; + me->arch.syminfo[i].plt_offset = -1UL; + me->arch.syminfo[i].got_initialized = 0; + me->arch.syminfo[i].plt_initialized = 0; + } + + /* Search for got/plt relocations. */ + me->arch.got_size = me->arch.plt_size = 0; + for (i = 0; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_RELA) + continue; + nrela = sechdrs[i].sh_size / sizeof(Elf_Rela); + rela = (void *) hdr + sechdrs[i].sh_offset; + for (j = 0; j < nrela; j++) + check_rela(rela + j, me); + } + + /* Increase core size by size of got & plt and set start + offsets for got and plt. */ + me->core_layout.size = ALIGN(me->core_layout.size, 4); + me->arch.got_offset = me->core_layout.size; + me->core_layout.size += me->arch.got_size; + me->arch.plt_offset = me->core_layout.size; + if (me->arch.plt_size) { + if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) + me->arch.plt_size += PLT_ENTRY_SIZE; + me->core_layout.size += me->arch.plt_size; + } + return 0; +} + +static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, + int sign, int bits, int shift) +{ + unsigned long umax; + long min, max; + + if (val & ((1UL << shift) - 1)) + return -ENOEXEC; + if (sign) { + val = (Elf_Addr)(((long) val) >> shift); + min = -(1L << (bits - 1)); + max = (1L << (bits - 1)) - 1; + if ((long) val < min || (long) val > max) + return -ENOEXEC; + } else { + val >>= shift; + umax = ((1UL << (bits - 1)) << 1) - 1; + if ((unsigned long) val > umax) + return -ENOEXEC; + } + + if (bits == 8) + *(unsigned char *) loc = val; + else if (bits == 12) + *(unsigned short *) loc = (val & 0xfff) | + (*(unsigned short *) loc & 0xf000); + else if (bits == 16) + *(unsigned short *) loc = val; + else if (bits == 20) + *(unsigned int *) loc = (val & 0xfff) << 16 | + (val & 0xff000) >> 4 | + (*(unsigned int *) loc & 0xf00000ff); + else if (bits == 32) + *(unsigned int *) loc = val; + else if (bits == 64) + *(unsigned long *) loc = val; + return 0; +} + +static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, + const char *strtab, struct module *me) +{ + struct mod_arch_syminfo *info; + Elf_Addr loc, val; + int r_type, r_sym; + int rc = -ENOEXEC; + + /* This is where to make the change */ + loc = base + rela->r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + r_sym = ELF_R_SYM(rela->r_info); + r_type = ELF_R_TYPE(rela->r_info); + info = me->arch.syminfo + r_sym; + val = symtab[r_sym].st_value; + + switch (r_type) { + case R_390_NONE: /* No relocation. */ + rc = 0; + break; + case R_390_8: /* Direct 8 bit. */ + case R_390_12: /* Direct 12 bit. */ + case R_390_16: /* Direct 16 bit. */ + case R_390_20: /* Direct 20 bit. */ + case R_390_32: /* Direct 32 bit. */ + case R_390_64: /* Direct 64 bit. */ + val += rela->r_addend; + if (r_type == R_390_8) + rc = apply_rela_bits(loc, val, 0, 8, 0); + else if (r_type == R_390_12) + rc = apply_rela_bits(loc, val, 0, 12, 0); + else if (r_type == R_390_16) + rc = apply_rela_bits(loc, val, 0, 16, 0); + else if (r_type == R_390_20) + rc = apply_rela_bits(loc, val, 1, 20, 0); + else if (r_type == R_390_32) + rc = apply_rela_bits(loc, val, 0, 32, 0); + else if (r_type == R_390_64) + rc = apply_rela_bits(loc, val, 0, 64, 0); + break; + case R_390_PC16: /* PC relative 16 bit. */ + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + case R_390_PC32: /* PC relative 32 bit. */ + case R_390_PC64: /* PC relative 64 bit. */ + val += rela->r_addend - loc; + if (r_type == R_390_PC16) + rc = apply_rela_bits(loc, val, 1, 16, 0); + else if (r_type == R_390_PC16DBL) + rc = apply_rela_bits(loc, val, 1, 16, 1); + else if (r_type == R_390_PC32DBL) + rc = apply_rela_bits(loc, val, 1, 32, 1); + else if (r_type == R_390_PC32) + rc = apply_rela_bits(loc, val, 1, 32, 0); + else if (r_type == R_390_PC64) + rc = apply_rela_bits(loc, val, 1, 64, 0); + break; + case R_390_GOT12: /* 12 bit GOT offset. */ + case R_390_GOT16: /* 16 bit GOT offset. */ + case R_390_GOT20: /* 20 bit GOT offset. */ + case R_390_GOT32: /* 32 bit GOT offset. */ + case R_390_GOT64: /* 64 bit GOT offset. */ + case R_390_GOTENT: /* 32 bit PC rel. to GOT entry shifted by 1. */ + case R_390_GOTPLT12: /* 12 bit offset to jump slot. */ + case R_390_GOTPLT20: /* 20 bit offset to jump slot. */ + case R_390_GOTPLT16: /* 16 bit offset to jump slot. */ + case R_390_GOTPLT32: /* 32 bit offset to jump slot. */ + case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ + case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ + if (info->got_initialized == 0) { + Elf_Addr *gotent; + + gotent = me->core_layout.base + me->arch.got_offset + + info->got_offset; + *gotent = val; + info->got_initialized = 1; + } + val = info->got_offset + rela->r_addend; + if (r_type == R_390_GOT12 || + r_type == R_390_GOTPLT12) + rc = apply_rela_bits(loc, val, 0, 12, 0); + else if (r_type == R_390_GOT16 || + r_type == R_390_GOTPLT16) + rc = apply_rela_bits(loc, val, 0, 16, 0); + else if (r_type == R_390_GOT20 || + r_type == R_390_GOTPLT20) + rc = apply_rela_bits(loc, val, 1, 20, 0); + else if (r_type == R_390_GOT32 || + r_type == R_390_GOTPLT32) + rc = apply_rela_bits(loc, val, 0, 32, 0); + else if (r_type == R_390_GOT64 || + r_type == R_390_GOTPLT64) + rc = apply_rela_bits(loc, val, 0, 64, 0); + else if (r_type == R_390_GOTENT || + r_type == R_390_GOTPLTENT) { + val += (Elf_Addr) me->core_layout.base - loc; + rc = apply_rela_bits(loc, val, 1, 32, 1); + } + break; + case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32DBL: /* 32 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32: /* 32 bit PC relative PLT address. */ + case R_390_PLT64: /* 64 bit PC relative PLT address. */ + case R_390_PLTOFF16: /* 16 bit offset from GOT to PLT. */ + case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ + case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ + if (info->plt_initialized == 0) { + unsigned int *ip; + ip = me->core_layout.base + me->arch.plt_offset + + info->plt_offset; + ip[0] = 0x0d10e310; /* basr 1,0 */ + ip[1] = 0x100a0004; /* lg 1,10(1) */ + if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { + unsigned int *ij; + ij = me->core_layout.base + + me->arch.plt_offset + + me->arch.plt_size - PLT_ENTRY_SIZE; + ip[2] = 0xa7f40000 + /* j __jump_r1 */ + (unsigned int)(u16) + (((unsigned long) ij - 8 - + (unsigned long) ip) / 2); + } else { + ip[2] = 0x07f10000; /* br %r1 */ + } + ip[3] = (unsigned int) (val >> 32); + ip[4] = (unsigned int) val; + info->plt_initialized = 1; + } + if (r_type == R_390_PLTOFF16 || + r_type == R_390_PLTOFF32 || + r_type == R_390_PLTOFF64) + val = me->arch.plt_offset - me->arch.got_offset + + info->plt_offset + rela->r_addend; + else { + if (!((r_type == R_390_PLT16DBL && + val - loc + 0xffffUL < 0x1ffffeUL) || + (r_type == R_390_PLT32DBL && + val - loc + 0xffffffffULL < 0x1fffffffeULL))) + val = (Elf_Addr) me->core_layout.base + + me->arch.plt_offset + + info->plt_offset; + val += rela->r_addend - loc; + } + if (r_type == R_390_PLT16DBL) + rc = apply_rela_bits(loc, val, 1, 16, 1); + else if (r_type == R_390_PLTOFF16) + rc = apply_rela_bits(loc, val, 0, 16, 0); + else if (r_type == R_390_PLT32DBL) + rc = apply_rela_bits(loc, val, 1, 32, 1); + else if (r_type == R_390_PLT32 || + r_type == R_390_PLTOFF32) + rc = apply_rela_bits(loc, val, 0, 32, 0); + else if (r_type == R_390_PLT64 || + r_type == R_390_PLTOFF64) + rc = apply_rela_bits(loc, val, 0, 64, 0); + break; + case R_390_GOTOFF16: /* 16 bit offset to GOT. */ + case R_390_GOTOFF32: /* 32 bit offset to GOT. */ + case R_390_GOTOFF64: /* 64 bit offset to GOT. */ + val = val + rela->r_addend - + ((Elf_Addr) me->core_layout.base + me->arch.got_offset); + if (r_type == R_390_GOTOFF16) + rc = apply_rela_bits(loc, val, 0, 16, 0); + else if (r_type == R_390_GOTOFF32) + rc = apply_rela_bits(loc, val, 0, 32, 0); + else if (r_type == R_390_GOTOFF64) + rc = apply_rela_bits(loc, val, 0, 64, 0); + break; + case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ + case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ + val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + + rela->r_addend - loc; + if (r_type == R_390_GOTPC) + rc = apply_rela_bits(loc, val, 1, 32, 0); + else if (r_type == R_390_GOTPCDBL) + rc = apply_rela_bits(loc, val, 1, 32, 1); + break; + case R_390_COPY: + case R_390_GLOB_DAT: /* Create GOT entry. */ + case R_390_JMP_SLOT: /* Create PLT entry. */ + case R_390_RELATIVE: /* Adjust by program base. */ + /* Only needed if we want to support loading of + modules linked with -shared. */ + return -ENOEXEC; + default: + printk(KERN_ERR "module %s: unknown relocation: %u\n", + me->name, r_type); + return -ENOEXEC; + } + if (rc) { + printk(KERN_ERR "module %s: relocation error for symbol %s " + "(r_type %i, value 0x%lx)\n", + me->name, strtab + symtab[r_sym].st_name, + r_type, (unsigned long) val); + return rc; + } + return 0; +} + +int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, + unsigned int symindex, unsigned int relsec, + struct module *me) +{ + Elf_Addr base; + Elf_Sym *symtab; + Elf_Rela *rela; + unsigned long i, n; + int rc; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + base = sechdrs[sechdrs[relsec].sh_info].sh_addr; + symtab = (Elf_Sym *) sechdrs[symindex].sh_addr; + rela = (Elf_Rela *) sechdrs[relsec].sh_addr; + n = sechdrs[relsec].sh_size / sizeof(Elf_Rela); + + for (i = 0; i < n; i++, rela++) { + rc = apply_rela(rela, base, symtab, strtab, me); + if (rc) + return rc; + } + return 0; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s; + char *secstrings, *secname; + void *aseg; + + if (IS_ENABLED(CONFIG_EXPOLINE) && + !nospec_disable && me->arch.plt_size) { + unsigned int *ij; + + ij = me->core_layout.base + me->arch.plt_offset + + me->arch.plt_size - PLT_ENTRY_SIZE; + if (test_facility(35)) { + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . */ + ij[2] = 0x000007f1; /* br %r1 */ + } else { + ij[0] = 0x44000000 | (unsigned int) + offsetof(struct lowcore, br_r1_trampoline); + ij[1] = 0xa7f40000; /* j . */ + } + } + + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + aseg = (void *) s->sh_addr; + secname = secstrings + s->sh_name; + + if (!strcmp(".altinstructions", secname)) + /* patch .altinstructions */ + apply_alternatives(aseg, aseg + s->sh_size); + + if (IS_ENABLED(CONFIG_EXPOLINE) && + (!strncmp(".s390_indirect", secname, 14))) + nospec_revert(aseg, aseg + s->sh_size); + + if (IS_ENABLED(CONFIG_EXPOLINE) && + (!strncmp(".s390_return", secname, 12))) + nospec_revert(aseg, aseg + s->sh_size); + } + + jump_label_apply_nops(me); + return 0; +} diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c new file mode 100644 index 000000000..8c867b43c --- /dev/null +++ b/arch/s390/kernel/nmi.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Machine check handler + * + * Copyright IBM Corp. 2000, 2009 + * Author(s): Ingo Adlung <adlung@de.ibm.com>, + * Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Cornelia Huck <cornelia.huck@de.ibm.com>, + * Heiko Carstens <heiko.carstens@de.ibm.com>, + */ + +#include <linux/kernel_stat.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/hardirq.h> +#include <linux/log2.h> +#include <linux/kprobes.h> +#include <linux/kmemleak.h> +#include <linux/time.h> +#include <linux/module.h> +#include <linux/sched/signal.h> + +#include <linux/export.h> +#include <asm/lowcore.h> +#include <asm/smp.h> +#include <asm/stp.h> +#include <asm/cputime.h> +#include <asm/nmi.h> +#include <asm/crw.h> +#include <asm/switch_to.h> +#include <asm/ctl_reg.h> +#include <asm/asm-offsets.h> +#include <linux/kvm_host.h> + +struct mcck_struct { + unsigned int kill_task : 1; + unsigned int channel_report : 1; + unsigned int warning : 1; + unsigned int stp_queue : 1; + unsigned long mcck_code; +}; + +static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); +static struct kmem_cache *mcesa_cache; +static unsigned long mcesa_origin_lc; + +static inline int nmi_needs_mcesa(void) +{ + return MACHINE_HAS_VX || MACHINE_HAS_GS; +} + +static inline unsigned long nmi_get_mcesa_size(void) +{ + if (MACHINE_HAS_GS) + return MCESA_MAX_SIZE; + return MCESA_MIN_SIZE; +} + +/* + * The initial machine check extended save area for the boot CPU. + * It will be replaced by nmi_init() with an allocated structure. + * The structure is required for machine check happening early in + * the boot process. + */ +static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); + +void __init nmi_alloc_boot_cpu(struct lowcore *lc) +{ + if (!nmi_needs_mcesa()) + return; + lc->mcesad = (unsigned long) &boot_mcesa; + if (MACHINE_HAS_GS) + lc->mcesad |= ilog2(MCESA_MAX_SIZE); +} + +static int __init nmi_init(void) +{ + unsigned long origin, cr0, size; + + if (!nmi_needs_mcesa()) + return 0; + size = nmi_get_mcesa_size(); + if (size > MCESA_MIN_SIZE) + mcesa_origin_lc = ilog2(size); + /* create slab cache for the machine-check-extended-save-areas */ + mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); + if (!mcesa_cache) + panic("Couldn't create nmi save area cache"); + origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + if (!origin) + panic("Couldn't allocate nmi save area"); + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak((void *) origin); + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + /* Replace boot_mcesa on the boot CPU */ + S390_lowcore.mcesad = origin | mcesa_origin_lc; + __ctl_load(cr0, 0, 0); + return 0; +} +early_initcall(nmi_init); + +int nmi_alloc_per_cpu(struct lowcore *lc) +{ + unsigned long origin; + + if (!nmi_needs_mcesa()) + return 0; + origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + if (!origin) + return -ENOMEM; + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak((void *) origin); + lc->mcesad = origin | mcesa_origin_lc; + return 0; +} + +void nmi_free_per_cpu(struct lowcore *lc) +{ + if (!nmi_needs_mcesa()) + return; + kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); +} + +static notrace void s390_handle_damage(void) +{ + smp_emergency_stop(); + disabled_wait((unsigned long) __builtin_return_address(0)); + while (1); +} +NOKPROBE_SYMBOL(s390_handle_damage); + +/* + * Main machine check handler function. Will be called with interrupts enabled + * or disabled and machine checks enabled or disabled. + */ +void s390_handle_mcck(void) +{ + unsigned long flags; + struct mcck_struct mcck; + + /* + * Disable machine checks and get the current state of accumulated + * machine checks. Afterwards delete the old state and enable machine + * checks again. + */ + local_irq_save(flags); + local_mcck_disable(); + mcck = *this_cpu_ptr(&cpu_mcck); + memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); + clear_cpu_flag(CIF_MCCK_PENDING); + local_mcck_enable(); + local_irq_restore(flags); + + if (mcck.channel_report) + crw_handle_channel_report(); + /* + * A warning may remain for a prolonged period on the bare iron. + * (actually until the machine is powered off, or the problem is gone) + * So we just stop listening for the WARNING MCH and avoid continuously + * being interrupted. One caveat is however, that we must do this per + * processor and cannot use the smp version of ctl_clear_bit(). + * On VM we only get one interrupt per virtally presented machinecheck. + * Though one suffices, we may get one interrupt per (virtual) cpu. + */ + if (mcck.warning) { /* WARNING pending ? */ + static int mchchk_wng_posted = 0; + + /* Use single cpu clear, as we cannot handle smp here. */ + __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + if (xchg(&mchchk_wng_posted, 1) == 0) + kill_cad_pid(SIGPWR, 1); + } + if (mcck.stp_queue) + stp_queue_work(); + if (mcck.kill_task) { + local_irq_enable(); + printk(KERN_EMERG "mcck: Terminating task because of machine " + "malfunction (code 0x%016lx).\n", mcck.mcck_code); + printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", + current->comm, current->pid); + do_exit(SIGSEGV); + } +} +EXPORT_SYMBOL_GPL(s390_handle_mcck); + +/* + * returns 0 if all required registers are available + * returns 1 otherwise + */ +static int notrace s390_check_registers(union mci mci, int umode) +{ + union ctlreg2 cr2; + int kill_task; + + kill_task = 0; + + if (!mci.gr) { + /* + * General purpose registers couldn't be restored and have + * unknown contents. Stop system or terminate process. + */ + if (!umode) + s390_handle_damage(); + kill_task = 1; + } + /* Check control registers */ + if (!mci.cr) { + /* + * Control registers have unknown contents. + * Can't recover and therefore stopping machine. + */ + s390_handle_damage(); + } + if (!mci.fp) { + /* + * Floating point registers can't be restored. If the + * kernel currently uses floating point registers the + * system is stopped. If the process has its floating + * pointer registers loaded it is terminated. + */ + if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) + s390_handle_damage(); + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; + } + if (!mci.fc) { + /* + * Floating point control register can't be restored. + * If the kernel currently uses the floating pointer + * registers and needs the FPC register the system is + * stopped. If the process has its floating pointer + * registers loaded it is terminated. + */ + if (S390_lowcore.fpu_flags & KERNEL_FPC) + s390_handle_damage(); + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; + } + + if (MACHINE_HAS_VX) { + if (!mci.vr) { + /* + * Vector registers can't be restored. If the kernel + * currently uses vector registers the system is + * stopped. If the process has its vector registers + * loaded it is terminated. + */ + if (S390_lowcore.fpu_flags & KERNEL_VXR) + s390_handle_damage(); + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; + } + } + /* Check if access registers are valid */ + if (!mci.ar) { + /* + * Access registers have unknown contents. + * Terminating task. + */ + kill_task = 1; + } + /* Check guarded storage registers */ + cr2.val = S390_lowcore.cregs_save_area[2]; + if (cr2.gse) { + if (!mci.gs) { + /* + * Guarded storage register can't be restored and + * the current processes uses guarded storage. + * It has to be terminated. + */ + kill_task = 1; + } + } + /* Check if old PSW is valid */ + if (!mci.wp) { + /* + * Can't tell if we come from user or kernel mode + * -> stopping machine. + */ + s390_handle_damage(); + } + /* Check for invalid kernel instruction address */ + if (!mci.ia && !umode) { + /* + * The instruction address got lost while running + * in the kernel -> stopping machine. + */ + s390_handle_damage(); + } + + if (!mci.ms || !mci.pm || !mci.ia) + kill_task = 1; + + return kill_task; +} +NOKPROBE_SYMBOL(s390_check_registers); + +/* + * Backup the guest's machine check info to its description block + */ +static void notrace s390_backup_mcck_info(struct pt_regs *regs) +{ + struct mcck_volatile_info *mcck_backup; + struct sie_page *sie_page; + + /* r14 contains the sie block, which was set in sie64a */ + struct kvm_s390_sie_block *sie_block = + (struct kvm_s390_sie_block *) regs->gprs[14]; + + if (sie_block == NULL) + /* Something's seriously wrong, stop system. */ + s390_handle_damage(); + + sie_page = container_of(sie_block, struct sie_page, sie_block); + mcck_backup = &sie_page->mcck_info; + mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); + mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; + mcck_backup->failing_storage_address + = S390_lowcore.failing_storage_address; +} +NOKPROBE_SYMBOL(s390_backup_mcck_info); + +#define MAX_IPD_COUNT 29 +#define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ + +#define ED_STP_ISLAND 6 /* External damage STP island check */ +#define ED_STP_SYNC 7 /* External damage STP sync check */ + +#define MCCK_CODE_NO_GUEST (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE) + +/* + * machine check handler. + */ +void notrace s390_do_machine_check(struct pt_regs *regs) +{ + static int ipd_count; + static DEFINE_SPINLOCK(ipd_lock); + static unsigned long long last_ipd; + struct mcck_struct *mcck; + unsigned long long tmp; + union mci mci; + unsigned long mcck_dam_code; + + nmi_enter(); + inc_irq_stat(NMI_NMI); + mci.val = S390_lowcore.mcck_interruption_code; + mcck = this_cpu_ptr(&cpu_mcck); + + if (mci.sd) { + /* System damage -> stopping machine */ + s390_handle_damage(); + } + + /* + * Reinject the instruction processing damages' machine checks + * including Delayed Access Exception into the guest + * instead of damaging the host if they happen in the guest. + */ + if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) { + if (mci.b) { + /* Processing backup -> verify if we can survive this */ + u64 z_mcic, o_mcic, t_mcic; + z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); + o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | + 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | + 1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 | + 1ULL<<16); + t_mcic = mci.val; + + if (((t_mcic & z_mcic) != 0) || + ((t_mcic & o_mcic) != o_mcic)) { + s390_handle_damage(); + } + + /* + * Nullifying exigent condition, therefore we might + * retry this instruction. + */ + spin_lock(&ipd_lock); + tmp = get_tod_clock(); + if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME) + ipd_count++; + else + ipd_count = 1; + last_ipd = tmp; + if (ipd_count == MAX_IPD_COUNT) + s390_handle_damage(); + spin_unlock(&ipd_lock); + } else { + /* Processing damage -> stopping machine */ + s390_handle_damage(); + } + } + if (s390_check_registers(mci, user_mode(regs))) { + /* + * Couldn't restore all register contents for the + * user space process -> mark task for termination. + */ + mcck->kill_task = 1; + mcck->mcck_code = mci.val; + set_cpu_flag(CIF_MCCK_PENDING); + } + + /* + * Backup the machine check's info if it happens when the guest + * is running. + */ + if (test_cpu_flag(CIF_MCCK_GUEST)) + s390_backup_mcck_info(regs); + + if (mci.cd) { + /* Timing facility damage */ + s390_handle_damage(); + } + if (mci.ed && mci.ec) { + /* External damage */ + if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + mcck->stp_queue |= stp_sync_check(); + if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + mcck->stp_queue |= stp_island_check(); + if (mcck->stp_queue) + set_cpu_flag(CIF_MCCK_PENDING); + } + + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + if (mci.se) + /* Storage error uncorrected */ + s390_handle_damage(); + if (mci.ke) + /* Storage key-error uncorrected */ + s390_handle_damage(); + if (mci.ds && mci.fa) + /* Storage degradation */ + s390_handle_damage(); + } + if (mci.cp) { + /* Channel report word pending */ + mcck->channel_report = 1; + set_cpu_flag(CIF_MCCK_PENDING); + } + if (mci.w) { + /* Warning pending */ + mcck->warning = 1; + set_cpu_flag(CIF_MCCK_PENDING); + } + + /* + * If there are only Channel Report Pending and External Damage + * machine checks, they will not be reinjected into the guest + * because they refer to host conditions only. + */ + mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK); + if (test_cpu_flag(CIF_MCCK_GUEST) && + (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) { + /* Set exit reason code for host's later handling */ + *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; + } + clear_cpu_flag(CIF_MCCK_GUEST); + nmi_exit(); +} +NOKPROBE_SYMBOL(s390_do_machine_check); + +static int __init machine_check_init(void) +{ + ctl_set_bit(14, 25); /* enable external damage MCH */ + ctl_set_bit(14, 27); /* enable system recovery MCH */ + ctl_set_bit(14, 24); /* enable warning MCH */ + return 0; +} +early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c new file mode 100644 index 000000000..649135cbe --- /dev/null +++ b/arch/s390/kernel/nospec-branch.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/module.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <asm/nospec-branch.h> + +static int __init nobp_setup_early(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + if (enabled && test_facility(82)) { + /* + * The user explicitely requested nobp=1, enable it and + * disable the expoline support. + */ + __set_facility(82, S390_lowcore.alt_stfle_fac_list); + if (IS_ENABLED(CONFIG_EXPOLINE)) + nospec_disable = 1; + } else { + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + } + return 0; +} +early_param("nobp", nobp_setup_early); + +static int __init nospec_setup_early(char *str) +{ + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + return 0; +} +early_param("nospec", nospec_setup_early); + +static int __init nospec_report(void) +{ + if (test_facility(156)) + pr_info("Spectre V2 mitigation: etokens\n"); + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + pr_info("Spectre V2 mitigation: execute trampolines\n"); + if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + pr_info("Spectre V2 mitigation: limited branch prediction\n"); + return 0; +} +arch_initcall(nospec_report); + +#ifdef CONFIG_EXPOLINE + +int nospec_disable = IS_ENABLED(CONFIG_EXPOLINE_OFF); + +static int __init nospectre_v2_setup_early(char *str) +{ + nospec_disable = 1; + return 0; +} +early_param("nospectre_v2", nospectre_v2_setup_early); + +void __init nospec_auto_detect(void) +{ + if (test_facility(156) || cpu_mitigations_off()) { + /* + * The machine supports etokens. + * Disable expolines and disable nobp. + */ + if (IS_ENABLED(CC_USING_EXPOLINE)) + nospec_disable = 1; + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + } else if (IS_ENABLED(CC_USING_EXPOLINE)) { + /* + * The kernel has been compiled with expolines. + * Keep expolines enabled and disable nobp. + */ + nospec_disable = 0; + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + } + /* + * If the kernel has not been compiled with expolines the + * nobp setting decides what is done, this depends on the + * CONFIG_KERNEL_NP option and the nobp/nospec parameters. + */ +} + +static int __init spectre_v2_setup_early(char *str) +{ + if (str && !strncmp(str, "on", 2)) { + nospec_disable = 0; + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + } + if (str && !strncmp(str, "off", 3)) + nospec_disable = 1; + if (str && !strncmp(str, "auto", 4)) + nospec_auto_detect(); + return 0; +} +early_param("spectre_v2", spectre_v2_setup_early); + +static void __init_or_module __nospec_revert(s32 *start, s32 *end) +{ + enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type; + u8 *instr, *thunk, *br; + u8 insnbuf[6]; + s32 *epo; + + /* Second part of the instruction replace is always a nop */ + for (epo = start; epo < end; epo++) { + instr = (u8 *) epo + *epo; + if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) + type = BRCL_EXPOLINE; /* brcl instruction */ + else if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x05) + type = BRASL_EXPOLINE; /* brasl instruction */ + else + continue; + thunk = instr + (*(int *)(instr + 2)) * 2; + if (thunk[0] == 0xc6 && thunk[1] == 0x00) + /* exrl %r0,<target-br> */ + br = thunk + (*(int *)(thunk + 2)) * 2; + else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 && + thunk[6] == 0x44 && thunk[7] == 0x00 && + (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 && + (thunk[1] & 0xf0) == (thunk[8] & 0xf0)) + /* larl %rx,<target br> + ex %r0,0(%rx) */ + br = thunk + (*(int *)(thunk + 2)) * 2; + else + continue; + /* Check for unconditional branch 0x07f? or 0x47f???? */ + if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) + continue; + + memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4); + switch (type) { + case BRCL_EXPOLINE: + insnbuf[0] = br[0]; + insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + if (br[0] == 0x47) { + /* brcl to b, replace with bc + nopr */ + insnbuf[2] = br[2]; + insnbuf[3] = br[3]; + } else { + /* brcl to br, replace with bcr + nop */ + } + break; + case BRASL_EXPOLINE: + insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + if (br[0] == 0x47) { + /* brasl to b, replace with bas + nopr */ + insnbuf[0] = 0x4d; + insnbuf[2] = br[2]; + insnbuf[3] = br[3]; + } else { + /* brasl to br, replace with basr + nop */ + insnbuf[0] = 0x0d; + } + break; + } + + s390_kernel_write(instr, insnbuf, 6); + } +} + +void __init_or_module nospec_revert(s32 *start, s32 *end) +{ + if (nospec_disable) + __nospec_revert(start, end); +} + +extern s32 __nospec_call_start[], __nospec_call_end[]; +extern s32 __nospec_return_start[], __nospec_return_end[]; +void __init nospec_init_branches(void) +{ + nospec_revert(__nospec_call_start, __nospec_call_end); + nospec_revert(__nospec_return_start, __nospec_return_end); +} + +#endif /* CONFIG_EXPOLINE */ diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c new file mode 100644 index 000000000..e30e580ae --- /dev/null +++ b/arch/s390/kernel/nospec-sysfs.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/device.h> +#include <linux/cpu.h> +#include <asm/facility.h> +#include <asm/nospec-branch.h> + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (test_facility(156)) + return sprintf(buf, "Mitigation: etokens\n"); + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + return sprintf(buf, "Mitigation: execute trampolines\n"); + if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + return sprintf(buf, "Mitigation: limited branch prediction\n"); + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c new file mode 100644 index 000000000..0a5e4bafb --- /dev/null +++ b/arch/s390/kernel/os_info.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OS info memory interface + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> + */ + +#define KMSG_COMPONENT "os_info" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/crash_dump.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <asm/checksum.h> +#include <asm/lowcore.h> +#include <asm/os_info.h> + +/* + * OS info structure has to be page aligned + */ +static struct os_info os_info __page_aligned_data; + +/* + * Compute checksum over OS info structure + */ +u32 os_info_csum(struct os_info *os_info) +{ + int size = sizeof(*os_info) - offsetof(struct os_info, version_major); + return (__force u32)csum_partial(&os_info->version_major, size, 0); +} + +/* + * Add crashkernel info to OS info and update checksum + */ +void os_info_crashkernel_add(unsigned long base, unsigned long size) +{ + os_info.crashkernel_addr = (u64)(unsigned long)base; + os_info.crashkernel_size = (u64)(unsigned long)size; + os_info.csum = os_info_csum(&os_info); +} + +/* + * Add OS info entry and update checksum + */ +void os_info_entry_add(int nr, void *ptr, u64 size) +{ + os_info.entry[nr].addr = (u64)(unsigned long)ptr; + os_info.entry[nr].size = size; + os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); + os_info.csum = os_info_csum(&os_info); +} + +/* + * Initialize OS info struture and set lowcore pointer + */ +void __init os_info_init(void) +{ + void *ptr = &os_info; + + os_info.version_major = OS_INFO_VERSION_MAJOR; + os_info.version_minor = OS_INFO_VERSION_MINOR; + os_info.magic = OS_INFO_MAGIC; + os_info.csum = os_info_csum(&os_info); + mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr); +} + +#ifdef CONFIG_CRASH_DUMP + +static struct os_info *os_info_old; + +/* + * Allocate and copy OS info entry from oldmem + */ +static void os_info_old_alloc(int nr, int align) +{ + unsigned long addr, size = 0; + char *buf, *buf_align, *msg; + u32 csum; + + addr = os_info_old->entry[nr].addr; + if (!addr) { + msg = "not available"; + goto fail; + } + size = os_info_old->entry[nr].size; + buf = kmalloc(size + align - 1, GFP_KERNEL); + if (!buf) { + msg = "alloc failed"; + goto fail; + } + buf_align = PTR_ALIGN(buf, align); + if (copy_oldmem_kernel(buf_align, (void *) addr, size)) { + msg = "copy failed"; + goto fail_free; + } + csum = (__force u32)csum_partial(buf_align, size, 0); + if (csum != os_info_old->entry[nr].csum) { + msg = "checksum failed"; + goto fail_free; + } + os_info_old->entry[nr].addr = (u64)(unsigned long)buf_align; + msg = "copied"; + goto out; +fail_free: + kfree(buf); +fail: + os_info_old->entry[nr].addr = 0; +out: + pr_info("entry %i: %s (addr=0x%lx size=%lu)\n", + nr, msg, addr, size); +} + +/* + * Initialize os info and os info entries from oldmem + */ +static void os_info_old_init(void) +{ + static int os_info_init; + unsigned long addr; + + if (os_info_init) + return; + if (!OLDMEM_BASE) + goto fail; + if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr))) + goto fail; + if (addr == 0 || addr % PAGE_SIZE) + goto fail; + os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); + if (!os_info_old) + goto fail; + if (copy_oldmem_kernel(os_info_old, (void *) addr, + sizeof(*os_info_old))) + goto fail_free; + if (os_info_old->magic != OS_INFO_MAGIC) + goto fail_free; + if (os_info_old->csum != os_info_csum(os_info_old)) + goto fail_free; + if (os_info_old->version_major > OS_INFO_VERSION_MAJOR) + goto fail_free; + os_info_old_alloc(OS_INFO_VMCOREINFO, 1); + os_info_old_alloc(OS_INFO_REIPL_BLOCK, 1); + pr_info("crashkernel: addr=0x%lx size=%lu\n", + (unsigned long) os_info_old->crashkernel_addr, + (unsigned long) os_info_old->crashkernel_size); + os_info_init = 1; + return; +fail_free: + kfree(os_info_old); +fail: + os_info_init = 1; + os_info_old = NULL; +} + +/* + * Return pointer to os infor entry and its size + */ +void *os_info_old_entry(int nr, unsigned long *size) +{ + os_info_old_init(); + + if (!os_info_old) + return NULL; + if (!os_info_old->entry[nr].addr) + return NULL; + *size = (unsigned long) os_info_old->entry[nr].size; + return (void *)(unsigned long)os_info_old->entry[nr].addr; +} +#endif diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c new file mode 100644 index 000000000..d5523aded --- /dev/null +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for s390x - CPU-measurement Counter Facility + * + * Copyright IBM Corp. 2012, 2017 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#define KMSG_COMPONENT "cpum_cf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <asm/ctl_reg.h> +#include <asm/irq.h> +#include <asm/cpu_mf.h> + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 +static const u64 cpumf_state_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + +static void ctr_set_enable(u64 *state, int ctr_set) +{ + *state |= cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT; +} +static void ctr_set_disable(u64 *state, int ctr_set) +{ + *state &= ~(cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT); +} +static void ctr_set_start(u64 *state, int ctr_set) +{ + *state |= cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT; +} +static void ctr_set_stop(u64 *state, int ctr_set) +{ + *state &= ~(cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +/* Local CPUMF event structure */ +struct cpu_hw_events { + struct cpumf_ctr_info info; + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state, tx_state; + unsigned int flags; + unsigned int txn_flags; +}; +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .ctr_set = { + [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), + }, + .state = 0, + .flags = 0, + .txn_flags = 0, +}; + +static enum cpumf_ctr_set get_counter_set(u64 event) +{ + int set = CPUMF_CTR_SET_MAX; + + if (event < 32) + set = CPUMF_CTR_SET_BASIC; + else if (event < 64) + set = CPUMF_CTR_SET_USER; + else if (event < 128) + set = CPUMF_CTR_SET_CRYPTO; + else if (event < 256) + set = CPUMF_CTR_SET_EXT; + else if (event >= 448 && event < 496) + set = CPUMF_CTR_SET_MT_DIAG; + + return set; +} + +static int validate_ctr_version(const struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuhw; + int err = 0; + u16 mtdiag_ctl; + + cpuhw = &get_cpu_var(cpu_hw_events); + + /* check required version for counter sets */ + switch (hwc->config_base) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + if (cpuhw->info.cfvn < 1) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + if (cpuhw->info.csvn < 1) + err = -EOPNOTSUPP; + if ((cpuhw->info.csvn == 1 && hwc->config > 159) || + (cpuhw->info.csvn == 2 && hwc->config > 175) || + (cpuhw->info.csvn > 2 && hwc->config > 255)) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpuhw->info.csvn <= 3) + err = -EOPNOTSUPP; + /* + * MT-diagnostic counters are read-only. The counter set + * is automatically enabled and activated on all CPUs with + * multithreading (SMT). Deactivation of multithreading + * also disables the counter set. State changes are ignored + * by lcctl(). Because Linux controls SMT enablement through + * a kernel parameter only, the counter set is either disabled + * or enabled and active. + * + * Thus, the counters can only be used if SMT is on and the + * counter set is enabled and active. + */ + mtdiag_ctl = cpumf_state_ctl[CPUMF_CTR_SET_MT_DIAG]; + if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && + (cpuhw->info.enable_ctl & mtdiag_ctl) && + (cpuhw->info.act_ctl & mtdiag_ctl))) + err = -EOPNOTSUPP; + break; + } + + put_cpu_var(cpu_hw_events); + return err; +} + +static int validate_ctr_auth(const struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuhw; + u64 ctrs_state; + int err = 0; + + cpuhw = &get_cpu_var(cpu_hw_events); + + /* Check authorization for cpu counter sets. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might suffice the event request. + */ + ctrs_state = cpumf_state_ctl[hwc->config_base]; + if (!(ctrs_state & cpuhw->info.auth_ctl)) + err = -ENOENT; + + put_cpu_var(cpu_hw_events); + return err; +} + +/* + * Change the CPUMF state to active. + * Enable and activate the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + int err; + + if (cpuhw->flags & PMU_F_ENABLED) + return; + + err = lcctl(cpuhw->state); + if (err) { + pr_err("Enabling the performance measuring unit " + "failed with rc=%x\n", err); + return; + } + + cpuhw->flags |= PMU_F_ENABLED; +} + +/* + * Change the CPUMF state to inactive. + * Disable and enable (inactive) the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + int err; + u64 inactive; + + if (!(cpuhw->flags & PMU_F_ENABLED)) + return; + + inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + err = lcctl(inactive); + if (err) { + pr_err("Disabling the performance measuring unit " + "failed with rc=%x\n", err); + return; + } + + cpuhw->flags &= ~PMU_F_ENABLED; +} + + +/* Number of perf events counting hardware events */ +static atomic_t num_events = ATOMIC_INIT(0); +/* Used to avoid races in calling reserve/release_cpumf_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_hw_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; + + inc_irq_stat(IRQEXT_CMC); + cpuhw = this_cpu_ptr(&cpu_hw_events); + + /* Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. */ + if (!(cpuhw->flags & PMU_F_RESERVED)) + return; + + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpuhw->info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); +} + +#define PMC_INIT 0 +#define PMC_RELEASE 1 +static void setup_pmc_cpu(void *flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + + switch (*((int *) flags)) { + case PMC_INIT: + memset(&cpuhw->info, 0, sizeof(cpuhw->info)); + qctri(&cpuhw->info); + cpuhw->flags |= PMU_F_RESERVED; + break; + + case PMC_RELEASE: + cpuhw->flags &= ~PMU_F_RESERVED; + break; + } + + /* Disable CPU counter sets */ + lcctl(0); +} + +/* Initialize the CPU-measurement facility */ +static int reserve_pmc_hardware(void) +{ + int flags = PMC_INIT; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + + return 0; +} + +/* Release the CPU-measurement facility */ +static void release_pmc_hardware(void) +{ + int flags = PMC_RELEASE; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); +} + +/* Release the PMU if event is the last perf event */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ +static const int cpumf_generic_events_basic[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; +/* CPUMF <-> perf event mappings for userspace (problem-state set) */ +static const int cpumf_generic_events_user[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 32, + [PERF_COUNT_HW_INSTRUCTIONS] = 33, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + enum cpumf_ctr_set set; + int err; + u64 ev; + + switch (attr->type) { + case PERF_TYPE_RAW: + /* Raw events are used to access counters directly, + * hence do not permit excludes */ + if (attr->exclude_kernel || attr->exclude_user || + attr->exclude_hv) + return -EOPNOTSUPP; + ev = attr->config; + break; + + case PERF_TYPE_HARDWARE: + if (is_sampling_event(event)) /* No sampling support */ + return -ENOENT; + ev = attr->config; + /* Count user space (problem-state) only */ + if (!attr->exclude_user && attr->exclude_kernel) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + + /* No support for kernel space counters only */ + } else if (!attr->exclude_kernel && attr->exclude_user) { + return -EOPNOTSUPP; + + /* Count user and kernel space */ + } else { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } + break; + + default: + return -ENOENT; + } + + if (ev == -1) + return -ENOENT; + + if (ev > PERF_CPUM_CF_MAX_CTR) + return -ENOENT; + + /* Obtain the counter set to which the specified counter belongs */ + set = get_counter_set(ev); + switch (set) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + case CPUMF_CTR_SET_MT_DIAG: + /* + * Use the hardware perf event structure to store the + * counter number in the 'config' member and the counter + * set number in the 'config_base'. The counter set number + * is then later used to enable/disable the counter(s). + */ + hwc->config = ev; + hwc->config_base = set; + break; + case CPUMF_CTR_SET_MAX: + /* The counter could not be associated to a counter set */ + return -EINVAL; + }; + + /* Initialize for using the CPU-measurement counter facility */ + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + /* Finally, validate version and authorization of the counter set */ + err = validate_ctr_auth(hwc); + if (!err) + err = validate_ctr_version(hwc); + + return err; +} + +static int cpumf_pmu_event_init(struct perf_event *event) +{ + int err; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + case PERF_TYPE_RAW: + err = __hw_perf_event_init(event); + break; + default: + return -ENOENT; + } + + if (unlikely(err) && event->destroy) + event->destroy(event); + + return err; +} + +static int hw_perf_event_reset(struct perf_event *event) +{ + u64 prev, new; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) { + if (err != 3) + break; + /* The counter is not (yet) available. This + * might happen if the counter set to which + * this counter belongs is in the disabled + * state. + */ + new = 0; + } + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + return err; +} + +static void hw_perf_event_update(struct perf_event *event) +{ + u64 prev, new, delta; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) + return; + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void cpumf_pmu_read(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return; + + hw_perf_event_update(event); +} + +static void cpumf_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(hwc->config == -1)) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* (Re-)enable and activate the counter set */ + ctr_set_enable(&cpuhw->state, hwc->config_base); + ctr_set_start(&cpuhw->state, hwc->config_base); + + /* The counter set to which this counter belongs can be already active. + * Because all counters in a set are active, the event->hw.prev_count + * needs to be synchronized. At this point, the counter set can be in + * the inactive or disabled state. + */ + hw_perf_event_reset(event); + + /* increment refcount for this counter set */ + atomic_inc(&cpuhw->ctr_set[hwc->config_base]); +} + +static void cpumf_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* Decrement reference count for this counter set and if this + * is the last used counter in the set, clear activation + * control and set the counter set state to inactive. + */ + if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) + ctr_set_stop(&cpuhw->state, hwc->config_base); + event->hw.state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + hw_perf_event_update(event); + event->hw.state |= PERF_HES_UPTODATE; + } +} + +static int cpumf_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + + /* Check authorization for the counter set to which this + * counter belongs. + * For group events transaction, the authorization check is + * done in cpumf_pmu_commit_txn(). + */ + if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD)) + if (validate_ctr_auth(&event->hw)) + return -ENOENT; + + ctr_set_enable(&cpuhw->state, event->hw.config_base); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + cpumf_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + + return 0; +} + +static void cpumf_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + + cpumf_pmu_stop(event, PERF_EF_UPDATE); + + /* Check if any counter in the counter set is still used. If not used, + * change the counter set to the disabled state. This also clears the + * content of all counters in the set. + * + * When a new perf event has been added but not yet started, this can + * clear enable control and resets all counters in a set. Therefore, + * cpumf_pmu_start() always has to reenable a counter set. + */ + if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) + ctr_set_disable(&cpuhw->state, event->hw.config_base); + + perf_event_update_userpage(event); +} + +/* + * Start group events scheduling transaction. + * Set flags to perform a single test at commit time. + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. + */ +static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + perf_pmu_disable(pmu); + cpuhw->tx_state = cpuhw->state; +} + +/* + * Stop and cancel a group events scheduling tranctions. + * Assumes cpumf_pmu_del() is called for each successful added + * cpumf_pmu_add() during the transaction. + */ +static void cpumf_pmu_cancel_txn(struct pmu *pmu) +{ + unsigned int txn_flags; + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + WARN_ON(cpuhw->tx_state != cpuhw->state); + + perf_pmu_enable(pmu); +} + +/* + * Commit the group events scheduling transaction. On success, the + * transaction is closed. On error, the transaction is kept open + * until cpumf_pmu_cancel_txn() is called. + */ +static int cpumf_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + u64 state; + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuhw->txn_flags = 0; + return 0; + } + + /* check if the updated state can be scheduled */ + state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + state >>= CPUMF_LCCTL_ENABLE_SHIFT; + if ((state & cpuhw->info.auth_ctl) != state) + return -ENOENT; + + cpuhw->txn_flags = 0; + perf_pmu_enable(pmu); + return 0; +} + +/* Performance monitoring unit for s390x */ +static struct pmu cpumf_pmu = { + .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .event_init = cpumf_pmu_event_init, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cpumf_pmu_read, + .start_txn = cpumf_pmu_start_txn, + .commit_txn = cpumf_pmu_commit_txn, + .cancel_txn = cpumf_pmu_cancel_txn, +}; + +static int cpumf_pmf_setup(unsigned int cpu, int flags) +{ + local_irq_disable(); + setup_pmc_cpu(&flags); + local_irq_enable(); + return 0; +} + +static int s390_pmu_online_cpu(unsigned int cpu) +{ + return cpumf_pmf_setup(cpu, PMC_INIT); +} + +static int s390_pmu_offline_cpu(unsigned int cpu) +{ + return cpumf_pmf_setup(cpu, PMC_RELEASE); +} + +static int __init cpumf_pmu_init(void) +{ + int rc; + + if (!cpum_cf_avail()) + return -ENODEV; + + /* clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters */ + ctl_clear_bit(0, 48); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts " + "failed with rc=%i\n", rc); + return rc; + } + + cpumf_pmu.attr_groups = cpumf_cf_event_group(); + rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", PERF_TYPE_RAW); + if (rc) { + pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + return rc; + } + return cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + s390_pmu_online_cpu, s390_pmu_offline_cpu); +} +early_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c new file mode 100644 index 000000000..d63fb3c56 --- /dev/null +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Perf PMU sysfs events attributes for available CPU-measurement counters + * + */ + +#include <linux/slab.h> +#include <linux/perf_event.h> + + +/* BEGIN: CPUM_CF COUNTER DEFINITIONS =================================== */ + +CPUMF_EVENT_ATTR(cf_fvn1, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn1, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES, 0x0022); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES, 0x0023); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES, 0x0024); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES, 0x0025); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_fvn3, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn3, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS, 0x0042); +CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); +CPUMF_EVENT_ATTR(cf_z10, L1D_L3_LOCAL_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z10, L1I_L3_REMOTE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z10, L1D_L3_REMOTE_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z10, L1D_LMEM_SOURCED_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z10, L1I_LMEM_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_z10, L1D_RO_EXCL_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z10, L1I_CACHELINE_INVALIDATES, 0x0089); +CPUMF_EVENT_ATTR(cf_z10, ITLB1_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z10, DTLB1_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z10, TLB2_PTE_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_z10, TLB2_CRSTE_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_z10, TLB2_CRSTE_HPAGE_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_z10, ITLB1_MISSES, 0x0091); +CPUMF_EVENT_ATTR(cf_z10, DTLB1_MISSES, 0x0092); +CPUMF_EVENT_ATTR(cf_z10, L2C_STORES_SENT, 0x0093); +CPUMF_EVENT_ATTR(cf_z196, L1D_L2_SOURCED_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z196, L1I_L2_SOURCED_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z196, ITLB1_MISSES, 0x0083); +CPUMF_EVENT_ATTR(cf_z196, L2C_STORES_SENT, 0x0085); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFBOOK_L3_SOURCED_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z196, L1D_ONBOOK_L4_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_z196, L1I_ONBOOK_L4_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z196, L1D_RO_EXCL_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFBOOK_L4_SOURCED_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFBOOK_L4_SOURCED_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_HPAGE_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_z196, L1D_LMEM_SOURCED_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_z196, L1I_LMEM_SOURCED_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFBOOK_L3_SOURCED_WRITES, 0x008f); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z196, ITLB1_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z196, TLB2_PTE_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_z196, TLB2_CRSTE_HPAGE_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z196, TLB2_CRSTE_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z196, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFCHIP_L3_SOURCED_WRITES, 0x0098); +CPUMF_EVENT_ATTR(cf_z196, L1I_ONCHIP_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFCHIP_L3_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_MISSES, 0x0080); +CPUMF_EVENT_ATTR(cf_zec12, ITLB1_MISSES, 0x0081); +CPUMF_EVENT_ATTR(cf_zec12, L1D_L2I_SOURCED_WRITES, 0x0082); +CPUMF_EVENT_ATTR(cf_zec12, L1I_L2I_SOURCED_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_zec12, L1D_L2D_SOURCED_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_zec12, L1D_LMEM_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_zec12, L1I_LMEM_SOURCED_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_zec12, L1D_RO_EXCL_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_HPAGE_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_zec12, ITLB1_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_PTE_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_CRSTE_HPAGE_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_CRSTE_WRITES, 0x008f); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONBOOK_L4_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L4_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TEND, 0x0095); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES_IV, 0x0097); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONBOOK_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TEND, 0x009e); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x009f); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV, 0x00a1); +CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TABORT, 0x00b1); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_NO_SPECIAL, 0x00b2); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_SPECIAL, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z13, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z13, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z13, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z13, L1C_TLB1_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0091); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV, 0x0093); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x0095); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x0098); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x0099); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x009c); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES, 0x009e); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES, 0x009f); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES, 0x00a0); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES, 0x00a1); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES, 0x00a4); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV, 0x00a5); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00a7); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV, 0x00a8); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x00aa); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x00ab); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x00ad); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES, 0x00b0); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES, 0x00b1); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES, 0x00b2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TABORT, 0x00da); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_NO_SPECIAL, 0x00db); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_SPECIAL, 0x00dc); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z14, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z14, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z14, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z14, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z14, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z14, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z14, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z14, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z14, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z14, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z14, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + +static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn3, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_generic_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_z10_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z10, L1I_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_L3_LOCAL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L3_LOCAL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_L3_REMOTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L3_REMOTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_CACHELINE_INVALIDATES), + CPUMF_EVENT_PTR(cf_z10, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z10, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z10, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z10, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z10, L2C_STORES_SENT), + NULL, +}; + +static struct attribute *cpumcf_z196_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z196, L1D_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z196, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z196, L2C_STORES_SENT), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z196, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFCHIP_L3_SOURCED_WRITES), + NULL, +}; + +static struct attribute *cpumcf_zec12_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_zec12, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_zec12, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_zec12, L1D_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_zec12, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TEND), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TABORT_SPECIAL), + NULL, +}; + +static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z13, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z13, L1C_TLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z14, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z14, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z14, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z14, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +/* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ + +static struct attribute_group cpumcf_pmu_events_group = { + .name = "events", +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cpumcf_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cpumcf_pmu_format_group = { + .name = "format", + .attrs = cpumcf_pmu_format_attr, +}; + +static const struct attribute_group *cpumcf_pmu_attr_groups[] = { + &cpumcf_pmu_events_group, + &cpumcf_pmu_format_group, + NULL, +}; + + +static __init struct attribute **merge_attr(struct attribute **a, + struct attribute **b, + struct attribute **c) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + for (i = 0; c[i]; i++) + j++; + j++; + + new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL); + if (!new) + return NULL; + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + for (i = 0; c[i]; i++) + new[j++] = c[i]; + new[j] = NULL; + + return new; +} + +__init const struct attribute_group **cpumf_cf_event_group(void) +{ + struct attribute **combined, **model, **cfvn, **csvn; + struct attribute *none[] = { NULL }; + struct cpumf_ctr_info ci; + struct cpuid cpu_id; + + /* Determine generic counters set(s) */ + qctri(&ci); + switch (ci.cfvn) { + case 1: + cfvn = cpumcf_fvn1_pmu_event_attr; + break; + case 3: + cfvn = cpumcf_fvn3_pmu_event_attr; + break; + default: + cfvn = none; + } + csvn = cpumcf_svn_generic_pmu_event_attr; + + /* Determine model-specific counter set(s) */ + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2097: + case 0x2098: + model = cpumcf_z10_pmu_event_attr; + break; + case 0x2817: + case 0x2818: + model = cpumcf_z196_pmu_event_attr; + break; + case 0x2827: + case 0x2828: + model = cpumcf_zec12_pmu_event_attr; + break; + case 0x2964: + case 0x2965: + model = cpumcf_z13_pmu_event_attr; + break; + case 0x3906: + case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; + default: + model = none; + break; + } + + combined = merge_attr(cfvn, csvn, model); + if (combined) + cpumcf_pmu_events_group.attrs = combined; + return cpumcf_pmu_attr_groups; +} diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c new file mode 100644 index 000000000..c8e1e3252 --- /dev/null +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -0,0 +1,2100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for the System z CPU-measurement Sampling Facility + * + * Copyright IBM Corp. 2013, 2018 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#define KMSG_COMPONENT "cpum_sf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/pid.h> +#include <linux/notifier.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/moduleparam.h> +#include <asm/cpu_mf.h> +#include <asm/irq.h> +#include <asm/debug.h> +#include <asm/timex.h> + +/* Minimum number of sample-data-block-tables: + * At least one table is required for the sampling buffer structure. + * A single table contains up to 511 pointers to sample-data-blocks. + */ +#define CPUM_SF_MIN_SDBT 1 + +/* Number of sample-data-blocks per sample-data-block-table (SDBT): + * A table contains SDB pointers (8 bytes) and one table-link entry + * that points to the origin of the next SDBT. + */ +#define CPUM_SF_SDB_PER_TABLE ((PAGE_SIZE - 8) / 8) + +/* Maximum page offset for an SDBT table-link entry: + * If this page offset is reached, a table-link entry to the next SDBT + * must be added. + */ +#define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) +static inline int require_table_link(const void *sdbt) +{ + return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; +} + +/* Minimum and maximum sampling buffer sizes: + * + * This number represents the maximum size of the sampling buffer taking + * the number of sample-data-block-tables into account. Note that these + * numbers apply to the basic-sampling function only. + * The maximum number of SDBs is increased by CPUM_SF_SDB_DIAG_FACTOR if + * the diagnostic-sampling function is active. + * + * Sampling buffer size Buffer characteristics + * --------------------------------------------------- + * 64KB == 16 pages (4KB per page) + * 1 page for SDB-tables + * 15 pages for SDBs + * + * 32MB == 8192 pages (4KB per page) + * 16 pages for SDB-tables + * 8176 pages for SDBs + */ +static unsigned long __read_mostly CPUM_SF_MIN_SDB = 15; +static unsigned long __read_mostly CPUM_SF_MAX_SDB = 8176; +static unsigned long __read_mostly CPUM_SF_SDB_DIAG_FACTOR = 1; + +struct sf_buffer { + unsigned long *sdbt; /* Sample-data-block-table origin */ + /* buffer characteristics (required for buffer increments) */ + unsigned long num_sdb; /* Number of sample-data-blocks */ + unsigned long num_sdbt; /* Number of sample-data-block-tables */ + unsigned long *tail; /* last sample-data-block-table */ +}; + +struct aux_buffer { + struct sf_buffer sfb; + unsigned long head; /* index of SDB of buffer head */ + unsigned long alert_mark; /* index of SDB of alert request position */ + unsigned long empty_mark; /* mark of SDB not marked full */ + unsigned long *sdb_index; /* SDB address for fast lookup */ + unsigned long *sdbt_index; /* SDBT address for fast lookup */ +}; + +struct cpu_hw_sf { + /* CPU-measurement sampling information block */ + struct hws_qsi_info_block qsi; + /* CPU-measurement sampling control block */ + struct hws_lsctl_request_block lsctl; + struct sf_buffer sfb; /* Sampling buffer */ + unsigned int flags; /* Status flags */ + struct perf_event *event; /* Scheduled perf event */ + struct perf_output_handle handle; /* AUX buffer output handle */ +}; +static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); + +/* Debug feature */ +static debug_info_t *sfdbg; + +/* + * sf_disable() - Switch off sampling facility + */ +static int sf_disable(void) +{ + struct hws_lsctl_request_block sreq; + + memset(&sreq, 0, sizeof(sreq)); + return lsctl(&sreq); +} + +/* + * sf_buffer_available() - Check for an allocated sampling buffer + */ +static int sf_buffer_available(struct cpu_hw_sf *cpuhw) +{ + return !!cpuhw->sfb.sdbt; +} + +/* + * deallocate sampling facility buffer + */ +static void free_sampling_buffer(struct sf_buffer *sfb) +{ + unsigned long *sdbt, *curr; + + if (!sfb->sdbt) + return; + + sdbt = sfb->sdbt; + curr = sdbt; + + /* Free the SDBT after all SDBs are processed... */ + while (1) { + if (!*curr || !sdbt) + break; + + /* Process table-link entries */ + if (is_link_entry(curr)) { + curr = get_next_sdbt(curr); + if (sdbt) + free_page((unsigned long) sdbt); + + /* If the origin is reached, sampling buffer is freed */ + if (curr == sfb->sdbt) + break; + else + sdbt = curr; + } else { + /* Process SDB pointer */ + if (*curr) { + free_page(*curr); + curr++; + } + } + } + + debug_sprintf_event(sfdbg, 5, + "free_sampling_buffer: freed sdbt=%p\n", sfb->sdbt); + memset(sfb, 0, sizeof(*sfb)); +} + +static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) +{ + unsigned long sdb, *trailer; + + /* Allocate and initialize sample-data-block */ + sdb = get_zeroed_page(gfp_flags); + if (!sdb) + return -ENOMEM; + trailer = trailer_entry_ptr(sdb); + *trailer = SDB_TE_ALERT_REQ_MASK; + + /* Link SDB into the sample-data-block-table */ + *sdbt = sdb; + + return 0; +} + +/* + * realloc_sampling_buffer() - extend sampler memory + * + * Allocates new sample-data-blocks and adds them to the specified sampling + * buffer memory. + * + * Important: This modifies the sampling buffer and must be called when the + * sampling facility is disabled. + * + * Returns zero on success, non-zero otherwise. + */ +static int realloc_sampling_buffer(struct sf_buffer *sfb, + unsigned long num_sdb, gfp_t gfp_flags) +{ + int i, rc; + unsigned long *new, *tail, *tail_prev = NULL; + + if (!sfb->sdbt || !sfb->tail) + return -EINVAL; + + if (!is_link_entry(sfb->tail)) + return -EINVAL; + + /* Append to the existing sampling buffer, overwriting the table-link + * register. + * The tail variables always points to the "tail" (last and table-link) + * entry in an SDB-table. + */ + tail = sfb->tail; + + /* Do a sanity check whether the table-link entry points to + * the sampling buffer origin. + */ + if (sfb->sdbt != get_next_sdbt(tail)) { + debug_sprintf_event(sfdbg, 3, "realloc_sampling_buffer: " + "sampling buffer is not linked: origin=%p" + "tail=%p\n", + (void *) sfb->sdbt, (void *) tail); + return -EINVAL; + } + + /* Allocate remaining SDBs */ + rc = 0; + for (i = 0; i < num_sdb; i++) { + /* Allocate a new SDB-table if it is full. */ + if (require_table_link(tail)) { + new = (unsigned long *) get_zeroed_page(gfp_flags); + if (!new) { + rc = -ENOMEM; + break; + } + sfb->num_sdbt++; + /* Link current page to tail of chain */ + *tail = (unsigned long)(void *) new + 1; + tail_prev = tail; + tail = new; + } + + /* Allocate a new sample-data-block. + * If there is not enough memory, stop the realloc process + * and simply use what was allocated. If this is a temporary + * issue, a new realloc call (if required) might succeed. + */ + rc = alloc_sample_data_block(tail, gfp_flags); + if (rc) { + /* Undo last SDBT. An SDBT with no SDB at its first + * entry but with an SDBT entry instead can not be + * handled by the interrupt handler code. + * Avoid this situation. + */ + if (tail_prev) { + sfb->num_sdbt--; + free_page((unsigned long) new); + tail = tail_prev; + } + break; + } + sfb->num_sdb++; + tail++; + tail_prev = new = NULL; /* Allocated at least one SBD */ + } + + /* Link sampling buffer to its origin */ + *tail = (unsigned long) sfb->sdbt + 1; + sfb->tail = tail; + + debug_sprintf_event(sfdbg, 4, "realloc_sampling_buffer: new buffer" + " settings: sdbt=%lu sdb=%lu\n", + sfb->num_sdbt, sfb->num_sdb); + return rc; +} + +/* + * allocate_sampling_buffer() - allocate sampler memory + * + * Allocates and initializes a sampling buffer structure using the + * specified number of sample-data-blocks (SDB). For each allocation, + * a 4K page is used. The number of sample-data-block-tables (SDBT) + * are calculated from SDBs. + * Also set the ALERT_REQ mask in each SDBs trailer. + * + * Returns zero on success, non-zero otherwise. + */ +static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) +{ + int rc; + + if (sfb->sdbt) + return -EINVAL; + + /* Allocate the sample-data-block-table origin */ + sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + return -ENOMEM; + sfb->num_sdb = 0; + sfb->num_sdbt = 1; + + /* Link the table origin to point to itself to prepare for + * realloc_sampling_buffer() invocation. + */ + sfb->tail = sfb->sdbt; + *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + + /* Allocate requested number of sample-data-blocks */ + rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); + if (rc) { + free_sampling_buffer(sfb); + debug_sprintf_event(sfdbg, 4, "alloc_sampling_buffer: " + "realloc_sampling_buffer failed with rc=%i\n", rc); + } else + debug_sprintf_event(sfdbg, 4, + "alloc_sampling_buffer: tear=%p dear=%p\n", + sfb->sdbt, (void *) *sfb->sdbt); + return rc; +} + +static void sfb_set_limits(unsigned long min, unsigned long max) +{ + struct hws_qsi_info_block si; + + CPUM_SF_MIN_SDB = min; + CPUM_SF_MAX_SDB = max; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); +} + +static unsigned long sfb_max_limit(struct hw_perf_event *hwc) +{ + return SAMPL_DIAG_MODE(hwc) ? CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR + : CPUM_SF_MAX_SDB; +} + +static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + if (!sfb->sdbt) + return SFB_ALLOC_REG(hwc); + if (SFB_ALLOC_REG(hwc) > sfb->num_sdb) + return SFB_ALLOC_REG(hwc) - sfb->num_sdb; + return 0; +} + +static int sfb_has_pending_allocs(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + return sfb_pending_allocs(sfb, hwc) > 0; +} + +static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) +{ + /* Limit the number of SDBs to not exceed the maximum */ + num = min_t(unsigned long, num, sfb_max_limit(hwc) - SFB_ALLOC_REG(hwc)); + if (num) + SFB_ALLOC_REG(hwc) += num; +} + +static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) +{ + SFB_ALLOC_REG(hwc) = 0; + sfb_account_allocs(num, hwc); +} + +static void deallocate_buffers(struct cpu_hw_sf *cpuhw) +{ + if (cpuhw->sfb.sdbt) + free_sampling_buffer(&cpuhw->sfb); +} + +static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) +{ + unsigned long n_sdb, freq, factor; + size_t sample_size; + + /* Calculate sampling buffers using 4K pages + * + * 1. Determine the sample data size which depends on the used + * sampling functions, for example, basic-sampling or + * basic-sampling with diagnostic-sampling. + * + * 2. Use the sampling frequency as input. The sampling buffer is + * designed for almost one second. This can be adjusted through + * the "factor" variable. + * In any case, alloc_sampling_buffer() sets the Alert Request + * Control indicator to trigger a measurement-alert to harvest + * sample-data-blocks (sdb). + * + * 3. Compute the number of sample-data-blocks and ensure a minimum + * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not + * exceed a "calculated" maximum. The symbolic maximum is + * designed for basic-sampling only and needs to be increased if + * diagnostic-sampling is active. + * See also the remarks for these symbolic constants. + * + * 4. Compute the number of sample-data-block-tables (SDBT) and + * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up + * to 511 SDBs). + */ + sample_size = sizeof(struct hws_basic_entry); + freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); + factor = 1; + n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); + if (n_sdb < CPUM_SF_MIN_SDB) + n_sdb = CPUM_SF_MIN_SDB; + + /* If there is already a sampling buffer allocated, it is very likely + * that the sampling facility is enabled too. If the event to be + * initialized requires a greater sampling buffer, the allocation must + * be postponed. Changing the sampling buffer requires the sampling + * facility to be in the disabled state. So, account the number of + * required SDBs and let cpumsf_pmu_enable() resize the buffer just + * before the event is started. + */ + sfb_init_allocs(n_sdb, hwc); + if (sf_buffer_available(cpuhw)) + return 0; + + debug_sprintf_event(sfdbg, 3, + "allocate_buffers: rate=%lu f=%lu sdb=%lu/%lu" + " sample_size=%lu cpuhw=%p\n", + SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), + sample_size, cpuhw); + + return alloc_sampling_buffer(&cpuhw->sfb, + sfb_pending_allocs(&cpuhw->sfb, hwc)); +} + +static unsigned long min_percent(unsigned int percent, unsigned long base, + unsigned long min) +{ + return min_t(unsigned long, min, DIV_ROUND_UP(percent * base, 100)); +} + +static unsigned long compute_sfb_extent(unsigned long ratio, unsigned long base) +{ + /* Use a percentage-based approach to extend the sampling facility + * buffer. Accept up to 5% sample data loss. + * Vary the extents between 1% to 5% of the current number of + * sample-data-blocks. + */ + if (ratio <= 5) + return 0; + if (ratio <= 25) + return min_percent(1, base, 1); + if (ratio <= 50) + return min_percent(1, base, 1); + if (ratio <= 75) + return min_percent(2, base, 2); + if (ratio <= 100) + return min_percent(3, base, 3); + if (ratio <= 250) + return min_percent(4, base, 4); + + return min_percent(5, base, 8); +} + +static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, + struct hw_perf_event *hwc) +{ + unsigned long ratio, num; + + if (!OVERFLOW_REG(hwc)) + return; + + /* The sample_overflow contains the average number of sample data + * that has been lost because sample-data-blocks were full. + * + * Calculate the total number of sample data entries that has been + * discarded. Then calculate the ratio of lost samples to total samples + * per second in percent. + */ + ratio = DIV_ROUND_UP(100 * OVERFLOW_REG(hwc) * cpuhw->sfb.num_sdb, + sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc))); + + /* Compute number of sample-data-blocks */ + num = compute_sfb_extent(ratio, cpuhw->sfb.num_sdb); + if (num) + sfb_account_allocs(num, hwc); + + debug_sprintf_event(sfdbg, 5, "sfb: overflow: overflow=%llu ratio=%lu" + " num=%lu\n", OVERFLOW_REG(hwc), ratio, num); + OVERFLOW_REG(hwc) = 0; +} + +/* extend_sampling_buffer() - Extend sampling buffer + * @sfb: Sampling buffer structure (for local CPU) + * @hwc: Perf event hardware structure + * + * Use this function to extend the sampling buffer based on the overflow counter + * and postponed allocation extents stored in the specified Perf event hardware. + * + * Important: This function disables the sampling facility in order to safely + * change the sampling buffer structure. Do not call this function + * when the PMU is active. + */ +static void extend_sampling_buffer(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + unsigned long num, num_old; + int rc; + + num = sfb_pending_allocs(sfb, hwc); + if (!num) + return; + num_old = sfb->num_sdb; + + /* Disable the sampling facility to reset any states and also + * clear pending measurement alerts. + */ + sf_disable(); + + /* Extend the sampling buffer. + * This memory allocation typically happens in an atomic context when + * called by perf. Because this is a reallocation, it is fine if the + * new SDB-request cannot be satisfied immediately. + */ + rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); + if (rc) + debug_sprintf_event(sfdbg, 5, "sfb: extend: realloc " + "failed with rc=%i\n", rc); + + if (sfb_has_pending_allocs(sfb, hwc)) + debug_sprintf_event(sfdbg, 5, "sfb: extend: " + "req=%lu alloc=%lu remaining=%lu\n", + num, sfb->num_sdb - num_old, + sfb_pending_allocs(sfb, hwc)); +} + + +/* Number of perf events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_cpumf_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +#define PMC_INIT 0 +#define PMC_RELEASE 1 +#define PMC_FAILURE 2 +static void setup_pmc_cpu(void *flags) +{ + int err; + struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + + err = 0; + switch (*((int *) flags)) { + case PMC_INIT: + memset(cpusf, 0, sizeof(*cpusf)); + err = qsi(&cpusf->qsi); + if (err) + break; + cpusf->flags |= PMU_F_RESERVED; + err = sf_disable(); + if (err) + pr_err("Switching off the sampling facility failed " + "with rc=%i\n", err); + debug_sprintf_event(sfdbg, 5, + "setup_pmc_cpu: initialized: cpuhw=%p\n", cpusf); + break; + case PMC_RELEASE: + cpusf->flags &= ~PMU_F_RESERVED; + err = sf_disable(); + if (err) { + pr_err("Switching off the sampling facility failed " + "with rc=%i\n", err); + } else + deallocate_buffers(cpusf); + debug_sprintf_event(sfdbg, 5, + "setup_pmc_cpu: released: cpuhw=%p\n", cpusf); + break; + } + if (err) + *((int *) flags) |= PMC_FAILURE; +} + +static void release_pmc_hardware(void) +{ + int flags = PMC_RELEASE; + + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(setup_pmc_cpu, &flags, 1); +} + +static int reserve_pmc_hardware(void) +{ + int flags = PMC_INIT; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + if (flags & PMC_FAILURE) { + release_pmc_hardware(); + return -ENODEV; + } + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + + return 0; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + /* Release PMC if this is the last perf event */ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static void hw_init_period(struct hw_perf_event *hwc, u64 period) +{ + hwc->sample_period = period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); +} + +static void hw_reset_registers(struct hw_perf_event *hwc, + unsigned long *sdbt_origin) +{ + /* (Re)set to first sample-data-block-table */ + TEAR_REG(hwc) = (unsigned long) sdbt_origin; +} + +static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, + unsigned long rate) +{ + return clamp_t(unsigned long, rate, + si->min_sampl_rate, si->max_sampl_rate); +} + +static u32 cpumsf_pid_type(struct perf_event *event, + u32 pid, enum pid_type type) +{ + struct task_struct *tsk; + + /* Idle process */ + if (!pid) + goto out; + + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + pid = -1; + if (tsk) { + /* + * Only top level events contain the pid namespace in which + * they are created. + */ + if (event->parent) + event = event->parent; + pid = __task_pid_nr_ns(tsk, type, event->ns); + /* + * See also 1d953111b648 + * "perf/core: Don't report zero PIDs for exiting tasks". + */ + if (!pid && !pid_alive(tsk)) + pid = -1; + } +out: + return pid; +} + +static void cpumsf_output_event_pid(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + u32 pid; + struct perf_event_header header; + struct perf_output_handle handle; + + /* + * Obtain the PID from the basic-sampling data entry and + * correct the data->tid_entry.pid value. + */ + pid = data->tid_entry.pid; + + /* Protect callchain buffers, tasks */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + if (perf_output_begin(&handle, event, header.size)) + goto out; + + /* Update the process ID (see also kernel/events/core.c) */ + data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID); + data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); + + perf_output_sample(&handle, &header, data, event); + perf_output_end(&handle); +out: + rcu_read_unlock(); +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct cpu_hw_sf *cpuhw; + struct hws_qsi_info_block si; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + unsigned long rate; + int cpu, err; + + /* Reserve CPU-measurement sampling facility */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + if (err) + goto out; + + /* Access per-CPU sampling information (query sampling info) */ + /* + * The event->cpu value can be -1 to count on every CPU, for example, + * when attaching to a task. If this is specified, use the query + * sampling info from the current CPU, otherwise use event->cpu to + * retrieve the per-CPU information. + * Later, cpuhw indicates whether to allocate sampling buffers for a + * particular CPU (cpuhw!=NULL) or each online CPU (cpuw==NULL). + */ + memset(&si, 0, sizeof(si)); + cpuhw = NULL; + if (event->cpu == -1) + qsi(&si); + else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + si = cpuhw->qsi; + } + + /* Check sampling facility authorization and, if not authorized, + * fall back to other PMUs. It is safe to check any CPU because + * the authorization is identical for all configured CPUs. + */ + if (!si.as) { + err = -ENOENT; + goto out; + } + + /* Always enable basic sampling */ + SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE; + + /* Check if diagnostic sampling is requested. Deny if the required + * sampling authorization is missing. + */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) { + if (!si.ad) { + err = -EPERM; + goto out; + } + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; + } + + /* Check and set other sampling flags */ + if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; + + /* The sampling information (si) contains information about the + * min/max sampling intervals and the CPU speed. So calculate the + * correct sampling interval and avoid the whole period adjust + * feedback loop. + */ + rate = 0; + if (attr->freq) { + if (!attr->sample_freq) { + err = -EINVAL; + goto out; + } + rate = freq_to_sample_rate(&si, attr->sample_freq); + rate = hw_limit_rate(&si, rate); + attr->freq = 0; + attr->sample_period = rate; + } else { + /* The min/max sampling rates specifies the valid range + * of sample periods. If the specified sample period is + * out of range, limit the period to the range boundary. + */ + rate = hw_limit_rate(&si, hwc->sample_period); + + /* The perf core maintains a maximum sample rate that is + * configurable through the sysctl interface. Ensure the + * sampling rate does not exceed this value. This also helps + * to avoid throttling when pushing samples with + * perf_event_overflow(). + */ + if (sample_rate_to_freq(&si, rate) > + sysctl_perf_event_sample_rate) { + err = -EINVAL; + debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n"); + goto out; + } + } + SAMPL_RATE(hwc) = rate; + hw_init_period(hwc, SAMPL_RATE(hwc)); + + /* Initialize sample data overflow accounting */ + hwc->extra_reg.reg = REG_OVERFLOW; + OVERFLOW_REG(hwc) = 0; + + /* Use AUX buffer. No need to allocate it by ourself */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) + return 0; + + /* Allocate the per-CPU sampling buffer using the CPU information + * from the event. If the event is not pinned to a particular + * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling + * buffers for each online CPU. + */ + if (cpuhw) + /* Event is pinned to a particular CPU */ + err = allocate_buffers(cpuhw, hwc); + else { + /* Event is not pinned, allocate sampling buffer on + * each online CPU + */ + for_each_online_cpu(cpu) { + cpuhw = &per_cpu(cpu_hw_sf, cpu); + err = allocate_buffers(cpuhw, hwc); + if (err) + break; + } + } + + /* If PID/TID sampling is active, replace the default overflow + * handler to extract and resolve the PIDs from the basic-sampling + * data entries. + */ + if (event->attr.sample_type & PERF_SAMPLE_TID) + if (is_default_overflow_handler(event)) + event->overflow_handler = cpumsf_output_event_pid; +out: + return err; +} + +static int cpumsf_pmu_event_init(struct perf_event *event) +{ + int err; + + /* No support for taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + if ((event->attr.config != PERF_EVENT_CPUM_SF) && + (event->attr.config != PERF_EVENT_CPUM_SF_DIAG)) + return -ENOENT; + break; + case PERF_TYPE_HARDWARE: + /* Support sampling of CPU cycles in addition to the + * counter facility. However, the counter facility + * is more precise and, hence, restrict this PMU to + * sampling events only. + */ + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES) + return -ENOENT; + if (!is_sampling_event(event)) + return -ENOENT; + break; + default: + return -ENOENT; + } + + /* Check online status of the CPU to which the event is pinned */ + if (event->cpu >= 0 && !cpu_online(event->cpu)) + return -ENODEV; + + /* Force reset of idle/hv excludes regardless of what the + * user requested. + */ + if (event->attr.exclude_hv) + event->attr.exclude_hv = 0; + if (event->attr.exclude_idle) + event->attr.exclude_idle = 0; + + err = __hw_perf_event_init(event); + if (unlikely(err)) + if (event->destroy) + event->destroy(event); + return err; +} + +static void cpumsf_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct hw_perf_event *hwc; + int err; + + if (cpuhw->flags & PMU_F_ENABLED) + return; + + if (cpuhw->flags & PMU_F_ERR_MASK) + return; + + /* Check whether to extent the sampling buffer. + * + * Two conditions trigger an increase of the sampling buffer for a + * perf event: + * 1. Postponed buffer allocations from the event initialization. + * 2. Sampling overflows that contribute to pending allocations. + * + * Note that the extend_sampling_buffer() function disables the sampling + * facility, but it can be fully re-enabled using sampling controls that + * have been saved in cpumsf_pmu_disable(). + */ + if (cpuhw->event) { + hwc = &cpuhw->event->hw; + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated + * buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) + extend_sampling_buffer(&cpuhw->sfb, hwc); + } + } + + /* (Re)enable the PMU and sampling facility */ + cpuhw->flags |= PMU_F_ENABLED; + barrier(); + + err = lsctl(&cpuhw->lsctl); + if (err) { + cpuhw->flags &= ~PMU_F_ENABLED; + pr_err("Loading sampling controls failed: op=%i err=%i\n", + 1, err); + return; + } + + /* Load current program parameter */ + lpp(&S390_lowcore.lpp); + + debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " + "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs, + cpuhw->lsctl.ed, cpuhw->lsctl.cd, + (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear); +} + +static void cpumsf_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct hws_lsctl_request_block inactive; + struct hws_qsi_info_block si; + int err; + + if (!(cpuhw->flags & PMU_F_ENABLED)) + return; + + if (cpuhw->flags & PMU_F_ERR_MASK) + return; + + /* Switch off sampling activation control */ + inactive = cpuhw->lsctl; + inactive.cs = 0; + inactive.cd = 0; + + err = lsctl(&inactive); + if (err) { + pr_err("Loading sampling controls failed: op=%i err=%i\n", + 2, err); + return; + } + + /* Save state of TEAR and DEAR register contents */ + if (!qsi(&si)) { + /* TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } + } else + debug_sprintf_event(sfdbg, 3, "cpumsf_pmu_disable: " + "qsi() failed with err=%i\n", err); + + cpuhw->flags &= ~PMU_F_ENABLED; +} + +/* perf_exclude_event() - Filter event + * @event: The perf event + * @regs: pt_regs structure + * @sde_regs: Sample-data-entry (sde) regs structure + * + * Filter perf events according to their exclude specification. + * + * Return non-zero if the event shall be excluded. + */ +static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, + struct perf_sf_sde_regs *sde_regs) +{ + if (event->attr.exclude_user && user_mode(regs)) + return 1; + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + if (event->attr.exclude_guest && sde_regs->in_guest) + return 1; + if (event->attr.exclude_host && !sde_regs->in_guest) + return 1; + return 0; +} + +/* perf_push_sample() - Push samples to perf + * @event: The perf event + * @sample: Hardware sample data + * + * Use the hardware sample data to create perf event sample. The sample + * is the pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int perf_push_sample(struct perf_event *event, + struct hws_basic_entry *basic) +{ + int overflow; + struct pt_regs regs; + struct perf_sf_sde_regs *sde_regs; + struct perf_sample_data data; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* Setup pt_regs to look like an CPU-measurement external interrupt + * using the Program Request Alert code. The regs.int_parm_long + * field which is unused contains additional sample-data-entry related + * indicators. + */ + memset(®s, 0, sizeof(regs)); + regs.int_code = 0x1407; + regs.int_parm = CPU_MF_INT_SF_PRA; + sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; + + psw_bits(regs.psw).ia = basic->ia; + psw_bits(regs.psw).dat = basic->T; + psw_bits(regs.psw).wait = basic->W; + psw_bits(regs.psw).pstate = basic->P; + psw_bits(regs.psw).as = basic->AS; + + /* + * Use the hardware provided configuration level to decide if the + * sample belongs to a guest or host. If that is not available, + * fall back to the following heuristics: + * A non-zero guest program parameter always indicates a guest + * sample. Some early samples or samples from guests without + * lpp usage would be misaccounted to the host. We use the asn + * value as an addon heuristic to detect most of these guest samples. + * If the value differs from 0xffff (the host value), we assume to + * be a KVM guest. + */ + switch (basic->CL) { + case 1: /* logical partition */ + sde_regs->in_guest = 0; + break; + case 2: /* virtual machine */ + sde_regs->in_guest = 1; + break; + default: /* old machine, use heuristics */ + if (basic->gpp || basic->prim_asn != 0xffff) + sde_regs->in_guest = 1; + break; + } + + /* + * Store the PID value from the sample-data-entry to be + * processed and resolved by cpumsf_output_event_pid(). + */ + data.tid_entry.pid = basic->hpp & LPP_PID_MASK; + + overflow = 0; + if (perf_exclude_event(event, ®s, sde_regs)) + goto out; + if (perf_event_overflow(event, &data, ®s)) { + overflow = 1; + event->pmu->stop(event, 0); + } + perf_event_update_userpage(event); +out: + return overflow; +} + +static void perf_event_count_update(struct perf_event *event, u64 count) +{ + local64_add(count, &event->count); +} + +static void debug_sample_entry(struct hws_basic_entry *sample, + struct hws_trailer_entry *te) +{ + debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown " + "sampling data entry: te->f=%i basic.def=%04x (%p)\n", + te->f, sample->def, sample); +} + +/* hw_collect_samples() - Walk through a sample-data-block and collect samples + * @event: The perf event + * @sdbt: Sample-data-block table + * @overflow: Event overflow counter + * + * Walks through a sample-data-block and collects sampling data entries that are + * then pushed to the perf event subsystem. Depending on the sampling function, + * there can be either basic-sampling or combined-sampling data entries. A + * combined-sampling data entry consists of a basic- and a diagnostic-sampling + * data entry. The sampling function is determined by the flags in the perf + * event hardware structure. The function always works with a combined-sampling + * data entry but ignores the the diagnostic portion if it is not available. + * + * Note that the implementation focuses on basic-sampling data entries and, if + * such an entry is not valid, the entire combined-sampling data entry is + * ignored. + * + * The overflow variables counts the number of samples that has been discarded + * due to a perf event overflow. + */ +static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, + unsigned long long *overflow) +{ + struct hws_trailer_entry *te; + struct hws_basic_entry *sample; + + te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sample = (struct hws_basic_entry *) *sdbt; + while ((unsigned long *) sample < (unsigned long *) te) { + /* Check for an empty sample */ + if (!sample->def) + break; + + /* Update perf event period */ + perf_event_count_update(event, SAMPL_RATE(&event->hw)); + + /* Check whether sample is valid */ + if (sample->def == 0x0001) { + /* If an event overflow occurred, the PMU is stopped to + * throttle event delivery. Remaining sample data is + * discarded. + */ + if (!*overflow) { + /* Check whether sample is consistent */ + if (sample->I == 0 && sample->W == 0) { + /* Deliver sample data to perf */ + *overflow = perf_push_sample(event, + sample); + } + } else + /* Count discarded samples */ + *overflow += 1; + } else { + debug_sample_entry(sample, te); + /* Sample slot is not yet written or other record. + * + * This condition can occur if the buffer was reused + * from a combined basic- and diagnostic-sampling. + * If only basic-sampling is then active, entries are + * written into the larger diagnostic entries. + * This is typically the case for sample-data-blocks + * that are not full. Stop processing if the first + * invalid format was detected. + */ + if (!te->f) + break; + } + + /* Reset sample slot and advance to next sample */ + sample->def = 0; + sample++; + } +} + +/* hw_perf_event_update() - Process sampling buffer + * @event: The perf event + * @flush_all: Flag to also flush partially filled sample-data-blocks + * + * Processes the sampling buffer and create perf event samples. + * The sampling buffer position are retrieved and saved in the TEAR_REG + * register of the specified perf event. + * + * Only full sample-data-blocks are processed. Specify the flash_all flag + * to also walk through partially filled sample-data-blocks. It is ignored + * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag + * enforces the processing of full sample-data-blocks only (trailer entries + * with the block-full-indicator bit set). + */ +static void hw_perf_event_update(struct perf_event *event, int flush_all) +{ + struct hw_perf_event *hwc = &event->hw; + struct hws_trailer_entry *te; + unsigned long *sdbt; + unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; + int done; + + /* + * AUX buffer is used when in diagnostic sampling mode. + * No perf events/samples are created. + */ + if (SAMPL_DIAG_MODE(&event->hw)) + return; + + if (flush_all && SDB_FULL_BLOCKS(hwc)) + flush_all = 0; + + sdbt = (unsigned long *) TEAR_REG(hwc); + done = event_overflow = sampl_overflow = num_sdb = 0; + while (!done) { + /* Get the trailer entry of the sample-data-block */ + te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + + /* Leave loop if no more work to do (block full indicator) */ + if (!te->f) { + done = 1; + if (!flush_all) + break; + } + + /* Check the sample overflow count */ + if (te->overflow) + /* Account sample overflows and, if a particular limit + * is reached, extend the sampling buffer. + * For details, see sfb_account_overflows(). + */ + sampl_overflow += te->overflow; + + /* Timestamps are valid for full sample-data-blocks only */ + debug_sprintf_event(sfdbg, 6, "hw_perf_event_update: sdbt=%p " + "overflow=%llu timestamp=0x%llx\n", + sdbt, te->overflow, + (te->f) ? trailer_timestamp(te) : 0ULL); + + /* Collect all samples from a single sample-data-block and + * flag if an (perf) event overflow happened. If so, the PMU + * is stopped and remaining samples will be discarded. + */ + hw_collect_samples(event, sdbt, &event_overflow); + num_sdb++; + + /* Reset trailer (using compare-double-and-swap) */ + do { + te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; + te_flags |= SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + te->flags, te->overflow, + te_flags, 0ULL)); + + /* Advance to next sample-data-block */ + sdbt++; + if (is_link_entry(sdbt)) + sdbt = get_next_sdbt(sdbt); + + /* Update event hardware registers */ + TEAR_REG(hwc) = (unsigned long) sdbt; + + /* Stop processing sample-data if all samples of the current + * sample-data-block were flushed even if it was not full. + */ + if (flush_all && done) + break; + } + + /* Account sample overflows in the event hardware structure */ + if (sampl_overflow) + OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) + + sampl_overflow, 1 + num_sdb); + + /* Perf_event_overflow() and perf_event_account_interrupt() limit + * the interrupt rate to an upper limit. Roughly 1000 samples per + * task tick. + * Hitting this limit results in a large number + * of throttled REF_REPORT_THROTTLE entries and the samples + * are dropped. + * Slightly increase the interval to avoid hitting this limit. + */ + if (event_overflow) { + SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", + __func__, + DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); + } + + if (sampl_overflow || event_overflow) + debug_sprintf_event(sfdbg, 4, "hw_perf_event_update: " + "overflow stats: sample=%llu event=%llu\n", + sampl_overflow, event_overflow); +} + +#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) +#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) +#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) +#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) + +/* + * Get trailer entry by index of SDB. + */ +static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, + unsigned long index) +{ + unsigned long sdb; + + index = AUX_SDB_INDEX(aux, index); + sdb = aux->sdb_index[index]; + return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); +} + +/* + * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu + * disabled. Collect the full SDBs in AUX buffer which have not reached + * the point of alert indicator. And ignore the SDBs which are not + * full. + * + * 1. Scan SDBs to see how much data is there and consume them. + * 2. Remove alert indicator in the buffer. + */ +static void aux_output_end(struct perf_output_handle *handle) +{ + unsigned long i, range_scan, idx; + struct aux_buffer *aux; + struct hws_trailer_entry *te; + + aux = perf_get_aux(handle); + if (!aux) + return; + + range_scan = AUX_SDB_NUM_ALERT(aux); + for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + break; + } + /* i is num of SDBs which are full */ + perf_aux_output_end(handle, i << PAGE_SHIFT); + + /* Remove alert indicators in the buffer */ + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags &= ~SDB_TE_ALERT_REQ_MASK; + + debug_sprintf_event(sfdbg, 6, "aux_output_end: collect %lx SDBs\n", i); +} + +/* + * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event + * is first added to the CPU or rescheduled again to the CPU. It is called + * with pmu disabled. + * + * 1. Reset the trailer of SDBs to get ready for new data. + * 2. Tell the hardware where to put the data by reset the SDBs buffer + * head(tear/dear). + */ +static int aux_output_begin(struct perf_output_handle *handle, + struct aux_buffer *aux, + struct cpu_hw_sf *cpuhw) +{ + unsigned long range; + unsigned long i, range_scan, idx; + unsigned long head, base, offset; + struct hws_trailer_entry *te; + + if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + return -EINVAL; + + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range <= 1) + return -ENOMEM; + + /* + * SDBs between aux->head and aux->empty_mark are already ready + * for new data. range_scan is num of SDBs not within them. + */ + if (range > AUX_SDB_NUM_EMPTY(aux)) { + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + te->flags &= ~(SDB_TE_BUFFER_FULL_MASK | + SDB_TE_ALERT_REQ_MASK); + te->overflow = 0; + } + /* Save the position of empty SDBs */ + aux->empty_mark = aux->head + range - 1; + } + + /* Set alert indicator */ + aux->alert_mark = aux->head + range/2 - 1; + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + + /* Reset hardware buffer head */ + head = AUX_SDB_INDEX(aux, aux->head); + base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; + offset = head % CPUM_SF_SDB_PER_TABLE; + cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = aux->sdb_index[head]; + + debug_sprintf_event(sfdbg, 6, "aux_output_begin: " + "head->alert_mark->empty_mark (num_alert, range)" + "[%lx -> %lx -> %lx] (%lx, %lx) " + "tear index %lx, tear %lx dear %lx\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range, + head / CPUM_SF_SDB_PER_TABLE, + cpuhw->lsctl.tear, + cpuhw->lsctl.dear); + + return 0; +} + +/* + * Set alert indicator on SDB at index @alert_index while sampler is running. + * + * Return true if successfully. + * Return false if full indicator is already set by hardware sampler. + */ +static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + struct hws_trailer_entry *te; + + te = aux_sdb_trailer(aux, alert_index); + do { + orig_flags = te->flags; + *overflow = orig_overflow = te->overflow; + if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + /* + * SDB is already set by hardware. + * Abort and try to set somewhere + * behind. + */ + return false; + } + new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + return true; +} + +/* + * aux_reset_buffer() - Scan and setup SDBs for new samples + * @aux: The AUX buffer to set + * @range: The range of SDBs to scan started from aux->head + * @overflow: Set to overflow count + * + * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is + * marked as empty, check if it is already set full by the hardware sampler. + * If yes, that means new data is already there before we can set an alert + * indicator. Caller should try to set alert indicator to some position behind. + * + * Scan the SDBs in AUX buffer from behind aux->empty_mark. They are used + * previously and have already been consumed by user space. Reset these SDBs + * (clear full indicator and alert indicator) for new data. + * If aux->alert_mark fall in this area, just set it. Overflow count is + * recorded while scanning. + * + * SDBs between aux->head and aux->empty_mark are already reset at last time. + * and ready for new samples. So scanning on this area could be skipped. + * + * Return true if alert indicator is set successfully and false if not. + */ +static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + unsigned long i, range_scan, idx; + struct hws_trailer_entry *te; + + if (range <= AUX_SDB_NUM_EMPTY(aux)) + /* + * No need to scan. All SDBs in range are marked as empty. + * Just set alert indicator. Should check race with hardware + * sampler. + */ + return aux_set_alert(aux, aux->alert_mark, overflow); + + if (aux->alert_mark <= aux->empty_mark) + /* + * Set alert indicator on empty SDB. Should check race + * with hardware sampler. + */ + if (!aux_set_alert(aux, aux->alert_mark, overflow)) + return false; + + /* + * Scan the SDBs to clear full and alert indicator used previously. + * Start scanning from one SDB behind empty_mark. If the new alert + * indicator fall into this range, set it. + */ + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + do { + orig_flags = te->flags; + orig_overflow = te->overflow; + new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + if (idx == aux->alert_mark) + new_flags |= SDB_TE_ALERT_REQ_MASK; + else + new_flags &= ~SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + *overflow += orig_overflow; + } + + /* Update empty_mark to new position */ + aux->empty_mark = aux->head + range - 1; + + return true; +} + +/* + * Measurement alert handler for diagnostic mode sampling. + */ +static void hw_collect_aux(struct cpu_hw_sf *cpuhw) +{ + struct aux_buffer *aux; + int done = 0; + unsigned long range = 0, size; + unsigned long long overflow = 0; + struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; + + aux = perf_get_aux(handle); + if (WARN_ON_ONCE(!aux)) + return; + + /* Inform user space new data arrived */ + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(handle, size); + num_sdb = aux->sfb.num_sdb; + + num_sdb = aux->sfb.num_sdb; + while (!done) { + /* Get an output handle */ + aux = perf_aux_output_begin(handle, cpuhw->event); + if (handle->size == 0) { + pr_err("The AUX buffer with %lu pages for the " + "diagnostic-sampling mode is full\n", + num_sdb); + debug_sprintf_event(sfdbg, 1, "AUX buffer used up\n"); + break; + } + if (WARN_ON_ONCE(!aux)) + return; + + /* Update head and alert_mark to new position */ + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range == 1) + aux->alert_mark = aux->head; + else + aux->alert_mark = aux->head + range/2 - 1; + + if (aux_reset_buffer(aux, range, &overflow)) { + if (!overflow) { + done = 1; + break; + } + size = range << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + pr_err("Sample data caused the AUX buffer with %lu " + "pages to overflow\n", num_sdb); + debug_sprintf_event(sfdbg, 1, "head %lx range %lx " + "overflow %llx\n", + aux->head, range, overflow); + } else { + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + debug_sprintf_event(sfdbg, 6, "head %lx alert %lx " + "already full, try another\n", + aux->head, aux->alert_mark); + } + } + + if (done) + debug_sprintf_event(sfdbg, 6, "aux_reset_buffer: " + "[%lx -> %lx -> %lx] (%lx, %lx)\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range); +} + +/* + * Callback when freeing AUX buffers. + */ +static void aux_buffer_free(void *data) +{ + struct aux_buffer *aux = data; + unsigned long i, num_sdbt; + + if (!aux) + return; + + /* Free SDBT. SDB is freed by the caller */ + num_sdbt = aux->sfb.num_sdbt; + for (i = 0; i < num_sdbt; i++) + free_page(aux->sdbt_index[i]); + + kfree(aux->sdbt_index); + kfree(aux->sdb_index); + kfree(aux); + + debug_sprintf_event(sfdbg, 4, "aux_buffer_free: free " + "%lu SDBTs\n", num_sdbt); +} + +static void aux_sdb_init(unsigned long sdb) +{ + struct hws_trailer_entry *te; + + te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + + /* Save clock base */ + te->clock_base = 1; + memcpy(&te->progusage2, &tod_clock_base[1], 8); +} + +/* + * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling + * @event: Event the buffer is setup for, event->cpu == -1 means current + * @pages: Array of pointers to buffer pages passed from perf core + * @nr_pages: Total pages + * @snapshot: Flag for snapshot mode + * + * This is the callback when setup an event using AUX buffer. Perf tool can + * trigger this by an additional mmap() call on the event. Unlike the buffer + * for basic samples, AUX buffer belongs to the event. It is scheduled with + * the task among online cpus when it is a per-thread event. + * + * Return the private AUX buffer structure if success or NULL if fails. + */ +static void *aux_buffer_setup(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) +{ + struct sf_buffer *sfb; + struct aux_buffer *aux; + unsigned long *new, *tail; + int i, n_sdbt; + + if (!nr_pages || !pages) + return NULL; + + if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is larger than the " + "maximum sampling buffer limit\n", + nr_pages); + return NULL; + } else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is less than the " + "minimum sampling buffer limit\n", + nr_pages); + return NULL; + } + + /* Allocate aux_buffer struct for the event */ + aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL); + if (!aux) + goto no_aux; + sfb = &aux->sfb; + + /* Allocate sdbt_index for fast reference */ + n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE; + aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); + if (!aux->sdbt_index) + goto no_sdbt_index; + + /* Allocate sdb_index for fast reference */ + aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!aux->sdb_index) + goto no_sdb_index; + + /* Allocate the first SDBT */ + sfb->num_sdbt = 0; + sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; + tail = sfb->tail = sfb->sdbt; + + /* + * Link the provided pages of AUX buffer to SDBT. + * Allocate SDBT if needed. + */ + for (i = 0; i < nr_pages; i++, tail++) { + if (require_table_link(tail)) { + new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!new) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; + /* Link current page to tail of chain */ + *tail = (unsigned long)(void *) new + 1; + tail = new; + } + /* Tail is the entry in a SDBT */ + *tail = (unsigned long)pages[i]; + aux->sdb_index[i] = (unsigned long)pages[i]; + aux_sdb_init((unsigned long)pages[i]); + } + sfb->num_sdb = nr_pages; + + /* Link the last entry in the SDBT to the first SDBT */ + *tail = (unsigned long) sfb->sdbt + 1; + sfb->tail = tail; + + /* + * Initial all SDBs are zeroed. Mark it as empty. + * So there is no need to clear the full indicator + * when this event is first added. + */ + aux->empty_mark = sfb->num_sdb - 1; + + debug_sprintf_event(sfdbg, 4, "aux_buffer_setup: setup %lu SDBTs" + " and %lu SDBs\n", + sfb->num_sdbt, sfb->num_sdb); + + return aux; + +no_sdbt: + /* SDBs (AUX buffer pages) are freed by caller */ + for (i = 0; i < sfb->num_sdbt; i++) + free_page(aux->sdbt_index[i]); + kfree(aux->sdb_index); +no_sdb_index: + kfree(aux->sdbt_index); +no_sdbt_index: + kfree(aux); +no_aux: + return NULL; +} + +static void cpumsf_pmu_read(struct perf_event *event) +{ + /* Nothing to do ... updates are interrupt-driven */ +} + +/* Activate sampling control. + * Next call of pmu_enable() starts sampling. + */ +static void cpumsf_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + perf_pmu_disable(event->pmu); + event->hw.state = 0; + cpuhw->lsctl.cs = 1; + if (SAMPL_DIAG_MODE(&event->hw)) + cpuhw->lsctl.cd = 1; + perf_pmu_enable(event->pmu); +} + +/* Deactivate sampling control. + * Next call of pmu_enable() stops sampling. + */ +static void cpumsf_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + perf_pmu_disable(event->pmu); + cpuhw->lsctl.cs = 0; + cpuhw->lsctl.cd = 0; + event->hw.state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { + hw_perf_event_update(event, 1); + event->hw.state |= PERF_HES_UPTODATE; + } + perf_pmu_enable(event->pmu); +} + +static int cpumsf_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct aux_buffer *aux; + int err; + + if (cpuhw->flags & PMU_F_IN_USE) + return -EAGAIN; + + if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) + return -EINVAL; + + err = 0; + perf_pmu_disable(event->pmu); + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* Set up sampling controls. Always program the sampling register + * using the SDB-table start. Reset TEAR_REG event hardware register + * that is used by hw_perf_event_update() to store the sampling buffer + * position after samples have been flushed. + */ + cpuhw->lsctl.s = 0; + cpuhw->lsctl.h = 1; + cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); + if (!SAMPL_DIAG_MODE(&event->hw)) { + cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; + hw_reset_registers(&event->hw, cpuhw->sfb.sdbt); + } + + /* Ensure sampling functions are in the disabled state. If disabled, + * switch on sampling enable control. */ + if (WARN_ON_ONCE(cpuhw->lsctl.es == 1 || cpuhw->lsctl.ed == 1)) { + err = -EAGAIN; + goto out; + } + if (SAMPL_DIAG_MODE(&event->hw)) { + aux = perf_aux_output_begin(&cpuhw->handle, event); + if (!aux) { + err = -EINVAL; + goto out; + } + err = aux_output_begin(&cpuhw->handle, aux, cpuhw); + if (err) + goto out; + cpuhw->lsctl.ed = 1; + } + cpuhw->lsctl.es = 1; + + /* Set in_use flag and store event */ + cpuhw->event = event; + cpuhw->flags |= PMU_F_IN_USE; + + if (flags & PERF_EF_START) + cpumsf_pmu_start(event, PERF_EF_RELOAD); +out: + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + return err; +} + +static void cpumsf_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + perf_pmu_disable(event->pmu); + cpumsf_pmu_stop(event, PERF_EF_UPDATE); + + cpuhw->lsctl.es = 0; + cpuhw->lsctl.ed = 0; + cpuhw->flags &= ~PMU_F_IN_USE; + cpuhw->event = NULL; + + if (SAMPL_DIAG_MODE(&event->hw)) + aux_output_end(&cpuhw->handle); + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); +} + +CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF); +CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); + +static struct attribute *cpumsf_pmu_events_attr[] = { + CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC), + NULL, + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cpumsf_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cpumsf_pmu_events_group = { + .name = "events", + .attrs = cpumsf_pmu_events_attr, +}; +static struct attribute_group cpumsf_pmu_format_group = { + .name = "format", + .attrs = cpumsf_pmu_format_attr, +}; +static const struct attribute_group *cpumsf_pmu_attr_groups[] = { + &cpumsf_pmu_events_group, + &cpumsf_pmu_format_group, + NULL, +}; + +static struct pmu cpumf_sampling = { + .pmu_enable = cpumsf_pmu_enable, + .pmu_disable = cpumsf_pmu_disable, + + .event_init = cpumsf_pmu_event_init, + .add = cpumsf_pmu_add, + .del = cpumsf_pmu_del, + + .start = cpumsf_pmu_start, + .stop = cpumsf_pmu_stop, + .read = cpumsf_pmu_read, + + .attr_groups = cpumsf_pmu_attr_groups, + + .setup_aux = aux_buffer_setup, + .free_aux = aux_buffer_free, +}; + +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_hw_sf *cpuhw; + + if (!(alert & CPU_MF_INT_SF_MASK)) + return; + inc_irq_stat(IRQEXT_CMS); + cpuhw = this_cpu_ptr(&cpu_hw_sf); + + /* Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. */ + if (!(cpuhw->flags & PMU_F_RESERVED)) + return; + + /* The processing below must take care of multiple alert events that + * might be indicated concurrently. */ + + /* Program alert request */ + if (alert & CPU_MF_INT_SF_PRA) { + if (cpuhw->flags & PMU_F_IN_USE) + if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) + hw_collect_aux(cpuhw); + else + hw_perf_event_update(cpuhw->event, 0); + else + WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); + } + + /* Report measurement alerts only for non-PRA codes */ + if (alert != CPU_MF_INT_SF_PRA) + debug_sprintf_event(sfdbg, 6, "measurement alert: 0x%x\n", alert); + + /* Sampling authorization change request */ + if (alert & CPU_MF_INT_SF_SACA) + qsi(&cpuhw->qsi); + + /* Loss of sample data due to high-priority machine activities */ + if (alert & CPU_MF_INT_SF_LSDA) { + pr_err("Sample data was lost\n"); + cpuhw->flags |= PMU_F_ERR_LSDA; + sf_disable(); + } + + /* Invalid sampling buffer entry */ + if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) { + pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n", + alert); + cpuhw->flags |= PMU_F_ERR_IBE; + sf_disable(); + } +} +static int cpusf_pmu_setup(unsigned int cpu, int flags) +{ + /* Ignore the notification if no events are scheduled on the PMU. + * This might be racy... + */ + if (!atomic_read(&num_events)) + return 0; + + local_irq_disable(); + setup_pmc_cpu(&flags); + local_irq_enable(); + return 0; +} + +static int s390_pmu_sf_online_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_INIT); +} + +static int s390_pmu_sf_offline_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_RELEASE); +} + +static int param_get_sfb_size(char *buffer, const struct kernel_param *kp) +{ + if (!cpum_sf_avail()) + return -ENODEV; + return sprintf(buffer, "%lu,%lu", CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB); +} + +static int param_set_sfb_size(const char *val, const struct kernel_param *kp) +{ + int rc; + unsigned long min, max; + + if (!cpum_sf_avail()) + return -ENODEV; + if (!val || !strlen(val)) + return -EINVAL; + + /* Valid parameter values: "min,max" or "max" */ + min = CPUM_SF_MIN_SDB; + max = CPUM_SF_MAX_SDB; + if (strchr(val, ',')) + rc = (sscanf(val, "%lu,%lu", &min, &max) == 2) ? 0 : -EINVAL; + else + rc = kstrtoul(val, 10, &max); + + if (min < 2 || min >= max || max > get_num_physpages()) + rc = -EINVAL; + if (rc) + return rc; + + sfb_set_limits(min, max); + pr_info("The sampling buffer limits have changed to: " + "min=%lu max=%lu (diag=x%lu)\n", + CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR); + return 0; +} + +#define param_check_sfb_size(name, p) __param_check(name, p, void) +static const struct kernel_param_ops param_ops_sfb_size = { + .set = param_set_sfb_size, + .get = param_get_sfb_size, +}; + +#define RS_INIT_FAILURE_QSI 0x0001 +#define RS_INIT_FAILURE_BSDES 0x0002 +#define RS_INIT_FAILURE_ALRT 0x0003 +#define RS_INIT_FAILURE_PERF 0x0004 +static void __init pr_cpumsf_err(unsigned int reason) +{ + pr_err("Sampling facility support for perf is not available: " + "reason=%04x\n", reason); +} + +static int __init init_cpum_sampling_pmu(void) +{ + struct hws_qsi_info_block si; + int err; + + if (!cpum_sf_avail()) + return -ENODEV; + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) { + pr_cpumsf_err(RS_INIT_FAILURE_QSI); + return -ENODEV; + } + + if (!si.as && !si.ad) + return -ENODEV; + + if (si.bsdes != sizeof(struct hws_basic_entry)) { + pr_cpumsf_err(RS_INIT_FAILURE_BSDES); + return -EINVAL; + } + + if (si.ad) { + sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB); + cpumsf_pmu_events_attr[1] = + CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); + } + + sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); + if (!sfdbg) { + pr_err("Registering for s390dbf failed\n"); + return -ENOMEM; + } + debug_register_view(sfdbg, &debug_sprintf_view); + + err = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (err) { + pr_cpumsf_err(RS_INIT_FAILURE_ALRT); + debug_unregister(sfdbg); + goto out; + } + + err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW); + if (err) { + pr_cpumsf_err(RS_INIT_FAILURE_PERF); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + debug_unregister(sfdbg); + goto out; + } + + cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "perf/s390/sf:online", + s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu); +out: + return err; +} +arch_initcall(init_cpum_sampling_pmu); +core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c new file mode 100644 index 000000000..0d770e513 --- /dev/null +++ b/arch/s390/kernel/perf_event.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for s390x + * + * Copyright IBM Corp. 2012, 2013 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#define KMSG_COMPONENT "perf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <linux/kvm_host.h> +#include <linux/percpu.h> +#include <linux/export.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/sysfs.h> +#include <asm/irq.h> +#include <asm/cpu_mf.h> +#include <asm/lowcore.h> +#include <asm/processor.h> +#include <asm/sysinfo.h> + +const char *perf_pmu_name(void) +{ + if (cpum_cf_avail() || cpum_sf_avail()) + return "CPU-Measurement Facilities (CPU-MF)"; + return "pmu"; +} +EXPORT_SYMBOL(perf_pmu_name); + +int perf_num_counters(void) +{ + int num = 0; + + if (cpum_cf_avail()) + num += PERF_CPUM_CF_MAX_CTR; + if (cpum_sf_avail()) + num += PERF_CPUM_SF_MAX_CTR; + + return num; +} +EXPORT_SYMBOL(perf_num_counters); + +static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) +{ + struct stack_frame *stack = (struct stack_frame *) regs->gprs[15]; + + if (!stack) + return NULL; + + return (struct kvm_s390_sie_block *) stack->empty1[0]; +} + +static bool is_in_guest(struct pt_regs *regs) +{ + if (user_mode(regs)) + return false; +#if IS_ENABLED(CONFIG_KVM) + return instruction_pointer(regs) == (unsigned long) &sie_exit; +#else + return false; +#endif +} + +static unsigned long guest_is_user_mode(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE; +} + +static unsigned long instruction_pointer_guest(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.addr; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + return is_in_guest(regs) ? instruction_pointer_guest(regs) + : instruction_pointer(regs); +} + +static unsigned long perf_misc_guest_flags(struct pt_regs *regs) +{ + return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER + : PERF_RECORD_MISC_GUEST_KERNEL; +} + +static unsigned long perf_misc_flags_sf(struct pt_regs *regs) +{ + struct perf_sf_sde_regs *sde_regs; + unsigned long flags; + + sde_regs = (struct perf_sf_sde_regs *) ®s->int_parm_long; + if (sde_regs->in_guest) + flags = user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER + : PERF_RECORD_MISC_GUEST_KERNEL; + else + flags = user_mode(regs) ? PERF_RECORD_MISC_USER + : PERF_RECORD_MISC_KERNEL; + return flags; +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + /* Check if the cpum_sf PMU has created the pt_regs structure. + * In this case, perf misc flags can be easily extracted. Otherwise, + * do regular checks on the pt_regs content. + */ + if (regs->int_code == 0x1407 && regs->int_parm == CPU_MF_INT_SF_PRA) + if (!regs->gprs[15]) + return perf_misc_flags_sf(regs); + + if (is_in_guest(regs)) + return perf_misc_guest_flags(regs); + + return user_mode(regs) ? PERF_RECORD_MISC_USER + : PERF_RECORD_MISC_KERNEL; +} + +static void print_debug_cf(void) +{ + struct cpumf_ctr_info cf_info; + int cpu = smp_processor_id(); + + memset(&cf_info, 0, sizeof(cf_info)); + if (!qctri(&cf_info)) + pr_info("CPU[%i] CPUM_CF: ver=%u.%u A=%04x E=%04x C=%04x\n", + cpu, cf_info.cfvn, cf_info.csvn, + cf_info.auth_ctl, cf_info.enable_ctl, cf_info.act_ctl); +} + +static void print_debug_sf(void) +{ + struct hws_qsi_info_block si; + int cpu = smp_processor_id(); + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) + return; + + pr_info("CPU[%i] CPUM_SF: basic=%i diag=%i min=%lu max=%lu cpu_speed=%u\n", + cpu, si.as, si.ad, si.min_sampl_rate, si.max_sampl_rate, + si.cpu_speed); + + if (si.as) + pr_info("CPU[%i] CPUM_SF: Basic-sampling: a=%i e=%i c=%i" + " bsdes=%i tear=%016lx dear=%016lx\n", cpu, + si.as, si.es, si.cs, si.bsdes, si.tear, si.dear); + if (si.ad) + pr_info("CPU[%i] CPUM_SF: Diagnostic-sampling: a=%i e=%i c=%i" + " dsdes=%i tear=%016lx dear=%016lx\n", cpu, + si.ad, si.ed, si.cd, si.dsdes, si.tear, si.dear); +} + +void perf_event_print_debug(void) +{ + unsigned long flags; + + local_irq_save(flags); + if (cpum_cf_avail()) + print_debug_cf(); + if (cpum_sf_avail()) + print_debug_sf(); + local_irq_restore(flags); +} + +/* Service level infrastructure */ +static void sl_print_counter(struct seq_file *m) +{ + struct cpumf_ctr_info ci; + + memset(&ci, 0, sizeof(ci)); + if (qctri(&ci)) + return; + + seq_printf(m, "CPU-MF: Counter facility: version=%u.%u " + "authorization=%04x\n", ci.cfvn, ci.csvn, ci.auth_ctl); +} + +static void sl_print_sampling(struct seq_file *m) +{ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) + return; + + if (!si.as && !si.ad) + return; + + seq_printf(m, "CPU-MF: Sampling facility: min_rate=%lu max_rate=%lu" + " cpu_speed=%u\n", si.min_sampl_rate, si.max_sampl_rate, + si.cpu_speed); + if (si.as) + seq_printf(m, "CPU-MF: Sampling facility: mode=basic" + " sample_size=%u\n", si.bsdes); + if (si.ad) + seq_printf(m, "CPU-MF: Sampling facility: mode=diagnostic" + " sample_size=%u\n", si.dsdes); +} + +static void service_level_perf_print(struct seq_file *m, + struct service_level *sl) +{ + if (cpum_cf_avail()) + sl_print_counter(m); + if (cpum_sf_avail()) + sl_print_sampling(m); +} + +static struct service_level service_level_perf = { + .seq_print = service_level_perf_print, +}; + +static int __init service_level_perf_register(void) +{ + return register_service_level(&service_level_perf); +} +arch_initcall(service_level_perf_register); + +static int __perf_callchain_kernel(void *data, unsigned long address, int reliable) +{ + struct perf_callchain_entry_ctx *entry = data; + + perf_callchain_store(entry, address); + return 0; +} + +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + if (user_mode(regs)) + return; + dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]); +} + +/* Perf definitions for PMU event attributes in sysfs */ +ssize_t cpumf_events_sysfs_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); +} diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c new file mode 100644 index 000000000..4352a504f --- /dev/null +++ b/arch/s390/kernel/perf_regs.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/perf_event.h> +#include <linux/perf_regs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/bug.h> +#include <asm/ptrace.h> +#include <asm/fpu/api.h> +#include <asm/fpu/types.h> + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + freg_t fp; + + if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15) + return regs->gprs[idx]; + + if (idx >= PERF_REG_S390_FP0 && idx <= PERF_REG_S390_FP15) { + if (!user_mode(regs)) + return 0; + + idx -= PERF_REG_S390_FP0; + fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) + : current->thread.fpu.fprs[idx]; + return fp.ui; + } + + if (idx == PERF_REG_S390_MASK) + return regs->psw.mask; + if (idx == PERF_REG_S390_PC) + return regs->psw.addr; + + WARN_ON_ONCE((u32)idx >= PERF_REG_S390_MAX); + return 0; +} + +#define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_31BIT)) + return PERF_SAMPLE_REGS_ABI_32; + + return PERF_SAMPLE_REGS_ABI_64; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + /* + * Use the regs from the first interruption and let + * perf_sample_regs_intr() handle interrupts (regs == get_irq_regs()). + * + * Also save FPU registers for user-space tasks only. + */ + regs_user->regs = task_pt_regs(current); + if (user_mode(regs_user->regs)) + save_fpu_regs(); + regs_user->abi = perf_reg_abi(current); +} diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S new file mode 100644 index 000000000..3e62aae34 --- /dev/null +++ b/arch/s390/kernel/pgm_check.S @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Program check table. + * + * Copyright IBM Corp. 2012 + */ + +#include <linux/linkage.h> + +#define PGM_CHECK(handler) .long handler +#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) + +/* + * The program check table contains exactly 128 (0x00-0x7f) entries. Each + * line defines the function to be called corresponding to the program check + * interruption code. + */ +.section .rodata, "a" +ENTRY(pgm_check_table) +PGM_CHECK_DEFAULT /* 00 */ +PGM_CHECK(illegal_op) /* 01 */ +PGM_CHECK(privileged_op) /* 02 */ +PGM_CHECK(execute_exception) /* 03 */ +PGM_CHECK(do_protection_exception) /* 04 */ +PGM_CHECK(addressing_exception) /* 05 */ +PGM_CHECK(specification_exception) /* 06 */ +PGM_CHECK(data_exception) /* 07 */ +PGM_CHECK(overflow_exception) /* 08 */ +PGM_CHECK(divide_exception) /* 09 */ +PGM_CHECK(overflow_exception) /* 0a */ +PGM_CHECK(divide_exception) /* 0b */ +PGM_CHECK(hfp_overflow_exception) /* 0c */ +PGM_CHECK(hfp_underflow_exception) /* 0d */ +PGM_CHECK(hfp_significance_exception) /* 0e */ +PGM_CHECK(hfp_divide_exception) /* 0f */ +PGM_CHECK(do_dat_exception) /* 10 */ +PGM_CHECK(do_dat_exception) /* 11 */ +PGM_CHECK(translation_exception) /* 12 */ +PGM_CHECK(special_op_exception) /* 13 */ +PGM_CHECK_DEFAULT /* 14 */ +PGM_CHECK(operand_exception) /* 15 */ +PGM_CHECK_DEFAULT /* 16 */ +PGM_CHECK_DEFAULT /* 17 */ +PGM_CHECK(transaction_exception) /* 18 */ +PGM_CHECK_DEFAULT /* 19 */ +PGM_CHECK_DEFAULT /* 1a */ +PGM_CHECK(vector_exception) /* 1b */ +PGM_CHECK(space_switch_exception) /* 1c */ +PGM_CHECK(hfp_sqrt_exception) /* 1d */ +PGM_CHECK_DEFAULT /* 1e */ +PGM_CHECK_DEFAULT /* 1f */ +PGM_CHECK_DEFAULT /* 20 */ +PGM_CHECK_DEFAULT /* 21 */ +PGM_CHECK_DEFAULT /* 22 */ +PGM_CHECK_DEFAULT /* 23 */ +PGM_CHECK_DEFAULT /* 24 */ +PGM_CHECK_DEFAULT /* 25 */ +PGM_CHECK_DEFAULT /* 26 */ +PGM_CHECK_DEFAULT /* 27 */ +PGM_CHECK_DEFAULT /* 28 */ +PGM_CHECK_DEFAULT /* 29 */ +PGM_CHECK_DEFAULT /* 2a */ +PGM_CHECK_DEFAULT /* 2b */ +PGM_CHECK_DEFAULT /* 2c */ +PGM_CHECK_DEFAULT /* 2d */ +PGM_CHECK_DEFAULT /* 2e */ +PGM_CHECK_DEFAULT /* 2f */ +PGM_CHECK_DEFAULT /* 30 */ +PGM_CHECK_DEFAULT /* 31 */ +PGM_CHECK_DEFAULT /* 32 */ +PGM_CHECK_DEFAULT /* 33 */ +PGM_CHECK_DEFAULT /* 34 */ +PGM_CHECK_DEFAULT /* 35 */ +PGM_CHECK_DEFAULT /* 36 */ +PGM_CHECK_DEFAULT /* 37 */ +PGM_CHECK(do_dat_exception) /* 38 */ +PGM_CHECK(do_dat_exception) /* 39 */ +PGM_CHECK(do_dat_exception) /* 3a */ +PGM_CHECK(do_dat_exception) /* 3b */ +PGM_CHECK_DEFAULT /* 3c */ +PGM_CHECK_DEFAULT /* 3d */ +PGM_CHECK_DEFAULT /* 3e */ +PGM_CHECK_DEFAULT /* 3f */ +PGM_CHECK_DEFAULT /* 40 */ +PGM_CHECK_DEFAULT /* 41 */ +PGM_CHECK_DEFAULT /* 42 */ +PGM_CHECK_DEFAULT /* 43 */ +PGM_CHECK_DEFAULT /* 44 */ +PGM_CHECK_DEFAULT /* 45 */ +PGM_CHECK_DEFAULT /* 46 */ +PGM_CHECK_DEFAULT /* 47 */ +PGM_CHECK_DEFAULT /* 48 */ +PGM_CHECK_DEFAULT /* 49 */ +PGM_CHECK_DEFAULT /* 4a */ +PGM_CHECK_DEFAULT /* 4b */ +PGM_CHECK_DEFAULT /* 4c */ +PGM_CHECK_DEFAULT /* 4d */ +PGM_CHECK_DEFAULT /* 4e */ +PGM_CHECK_DEFAULT /* 4f */ +PGM_CHECK_DEFAULT /* 50 */ +PGM_CHECK_DEFAULT /* 51 */ +PGM_CHECK_DEFAULT /* 52 */ +PGM_CHECK_DEFAULT /* 53 */ +PGM_CHECK_DEFAULT /* 54 */ +PGM_CHECK_DEFAULT /* 55 */ +PGM_CHECK_DEFAULT /* 56 */ +PGM_CHECK_DEFAULT /* 57 */ +PGM_CHECK_DEFAULT /* 58 */ +PGM_CHECK_DEFAULT /* 59 */ +PGM_CHECK_DEFAULT /* 5a */ +PGM_CHECK_DEFAULT /* 5b */ +PGM_CHECK_DEFAULT /* 5c */ +PGM_CHECK_DEFAULT /* 5d */ +PGM_CHECK_DEFAULT /* 5e */ +PGM_CHECK_DEFAULT /* 5f */ +PGM_CHECK_DEFAULT /* 60 */ +PGM_CHECK_DEFAULT /* 61 */ +PGM_CHECK_DEFAULT /* 62 */ +PGM_CHECK_DEFAULT /* 63 */ +PGM_CHECK_DEFAULT /* 64 */ +PGM_CHECK_DEFAULT /* 65 */ +PGM_CHECK_DEFAULT /* 66 */ +PGM_CHECK_DEFAULT /* 67 */ +PGM_CHECK_DEFAULT /* 68 */ +PGM_CHECK_DEFAULT /* 69 */ +PGM_CHECK_DEFAULT /* 6a */ +PGM_CHECK_DEFAULT /* 6b */ +PGM_CHECK_DEFAULT /* 6c */ +PGM_CHECK_DEFAULT /* 6d */ +PGM_CHECK_DEFAULT /* 6e */ +PGM_CHECK_DEFAULT /* 6f */ +PGM_CHECK_DEFAULT /* 70 */ +PGM_CHECK_DEFAULT /* 71 */ +PGM_CHECK_DEFAULT /* 72 */ +PGM_CHECK_DEFAULT /* 73 */ +PGM_CHECK_DEFAULT /* 74 */ +PGM_CHECK_DEFAULT /* 75 */ +PGM_CHECK_DEFAULT /* 76 */ +PGM_CHECK_DEFAULT /* 77 */ +PGM_CHECK_DEFAULT /* 78 */ +PGM_CHECK_DEFAULT /* 79 */ +PGM_CHECK_DEFAULT /* 7a */ +PGM_CHECK_DEFAULT /* 7b */ +PGM_CHECK_DEFAULT /* 7c */ +PGM_CHECK_DEFAULT /* 7d */ +PGM_CHECK_DEFAULT /* 7e */ +PGM_CHECK_DEFAULT /* 7f */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c new file mode 100644 index 000000000..99ef537e5 --- /dev/null +++ b/arch/s390/kernel/process.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file handles the architecture dependent parts of process handling. + * + * Copyright IBM Corp. 1999, 2009 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Hartmut Penner <hp@de.ibm.com>, + * Denis Joseph Barrow, + */ + +#include <linux/elf-randomize.h> +#include <linux/compiler.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/personality.h> +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/kprobes.h> +#include <linux/random.h> +#include <linux/export.h> +#include <linux/init_task.h> +#include <asm/cpu_mf.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/vtimer.h> +#include <asm/exec.h> +#include <asm/irq.h> +#include <asm/nmi.h> +#include <asm/smp.h> +#include <asm/switch_to.h> +#include <asm/runtime_instr.h> +#include "entry.h" + +asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); + +extern void kernel_thread_starter(void); + +void flush_thread(void) +{ +} + +void arch_setup_new_exec(void) +{ + if (S390_lowcore.current_pid != current->pid) { + S390_lowcore.current_pid = current->pid; + if (test_facility(40)) + lpp(&S390_lowcore.lpp); + } +} + +void arch_release_task_struct(struct task_struct *tsk) +{ + runtime_instr_release(tsk); + guarded_storage_release(tsk); +} + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + /* + * Save the floating-point or vector register state of the current + * task and set the CIF_FPU flag to lazy restore the FPU register + * state when returning to user space. + */ + save_fpu_regs(); + + memcpy(dst, src, arch_task_struct_size); + dst->thread.fpu.regs = dst->thread.fpu.fprs; + return 0; +} + +int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, + unsigned long arg, struct task_struct *p, unsigned long tls) +{ + struct fake_frame + { + struct stack_frame sf; + struct pt_regs childregs; + } *frame; + + frame = container_of(task_pt_regs(p), struct fake_frame, childregs); + p->thread.ksp = (unsigned long) frame; + /* Save access registers to new thread structure. */ + save_access_regs(&p->thread.acrs[0]); + /* start new process with ar4 pointing to the correct address space */ + p->thread.mm_segment = get_fs(); + /* Don't copy debug registers */ + memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); + memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); + clear_tsk_thread_flag(p, TIF_SINGLE_STEP); + p->thread.per_flags = 0; + /* Initialize per thread user and system timer values */ + p->thread.user_timer = 0; + p->thread.guest_timer = 0; + p->thread.system_timer = 0; + p->thread.hardirq_timer = 0; + p->thread.softirq_timer = 0; + + frame->sf.back_chain = 0; + /* new return point is ret_from_fork */ + frame->sf.gprs[8] = (unsigned long) ret_from_fork; + /* fake return stack for resume(), don't go back to schedule */ + frame->sf.gprs[9] = (unsigned long) frame; + + /* Store access registers to kernel stack of new process. */ + if (unlikely(p->flags & PF_KTHREAD)) { + /* kernel thread */ + memset(&frame->childregs, 0, sizeof(struct pt_regs)); + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.psw.addr = + (unsigned long) kernel_thread_starter; + frame->childregs.gprs[9] = new_stackp; /* function */ + frame->childregs.gprs[10] = arg; + frame->childregs.gprs[11] = (unsigned long) do_exit; + frame->childregs.orig_gpr2 = -1; + + return 0; + } + frame->childregs = *current_pt_regs(); + frame->childregs.gprs[2] = 0; /* child returns 0 on fork. */ + frame->childregs.flags = 0; + if (new_stackp) + frame->childregs.gprs[15] = new_stackp; + + /* Don't copy runtime instrumentation info */ + p->thread.ri_cb = NULL; + frame->childregs.psw.mask &= ~PSW_MASK_RI; + /* Don't copy guarded storage control block */ + p->thread.gs_cb = NULL; + p->thread.gs_bc_cb = NULL; + + /* Set a new TLS ? */ + if (clone_flags & CLONE_SETTLS) { + if (is_compat_task()) { + p->thread.acrs[0] = (unsigned int)tls; + } else { + p->thread.acrs[0] = (unsigned int)(tls >> 32); + p->thread.acrs[1] = (unsigned int)tls; + } + } + return 0; +} + +asmlinkage void execve_tail(void) +{ + current->thread.fpu.fpc = 0; + asm volatile("sfpc %0" : : "d" (0)); +} + +/* + * fill in the FPU structure for a core dump. + */ +int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) +{ + save_fpu_regs(); + fpregs->fpc = current->thread.fpu.fpc; + fpregs->pad = 0; + if (MACHINE_HAS_VX) + convert_vx_to_fp((freg_t *)&fpregs->fprs, + current->thread.fpu.vxrs); + else + memcpy(&fpregs->fprs, current->thread.fpu.fprs, + sizeof(fpregs->fprs)); + return 1; +} +EXPORT_SYMBOL(dump_fpu); + +unsigned long get_wchan(struct task_struct *p) +{ + struct stack_frame *sf, *low, *high; + unsigned long return_address; + int count; + + if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p)) + return 0; + + if (!try_get_task_stack(p)) + return 0; + + low = task_stack_page(p); + high = (struct stack_frame *) task_pt_regs(p); + sf = (struct stack_frame *) p->thread.ksp; + if (sf <= low || sf > high) { + return_address = 0; + goto out; + } + for (count = 0; count < 16; count++) { + sf = (struct stack_frame *) sf->back_chain; + if (sf <= low || sf > high) { + return_address = 0; + goto out; + } + return_address = sf->gprs[8]; + if (!in_sched_functions(return_address)) + goto out; + } +out: + put_task_stack(p); + return return_address; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() & ~PAGE_MASK; + return sp & ~0xf; +} + +static inline unsigned long brk_rnd(void) +{ + return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long ret; + + ret = PAGE_ALIGN(mm->brk + brk_rnd()); + return (ret > mm->brk) ? ret : mm->brk; +} + +void set_fs_fixup(void) +{ + struct pt_regs *regs = current_pt_regs(); + static bool warned; + + set_fs(USER_DS); + if (warned) + return; + WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code); + show_registers(regs); + warned = true; +} diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c new file mode 100644 index 000000000..675d4be0c --- /dev/null +++ b/arch/s390/kernel/processor.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/cpufeature.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/sched/mm.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/mm_types.h> +#include <linux/delay.h> +#include <linux/cpu.h> + +#include <asm/diag.h> +#include <asm/facility.h> +#include <asm/elf.h> +#include <asm/lowcore.h> +#include <asm/param.h> +#include <asm/smp.h> + +struct cpu_info { + unsigned int cpu_mhz_dynamic; + unsigned int cpu_mhz_static; + struct cpuid cpu_id; +}; + +static DEFINE_PER_CPU(struct cpu_info, cpu_info); + +static bool machine_has_cpu_mhz; + +void __init cpu_detect_mhz_feature(void) +{ + if (test_facility(34) && __ecag(ECAG_CPU_ATTRIBUTE, 0) != -1UL) + machine_has_cpu_mhz = true; +} + +static void update_cpu_mhz(void *arg) +{ + unsigned long mhz; + struct cpu_info *c; + + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + c = this_cpu_ptr(&cpu_info); + c->cpu_mhz_dynamic = mhz >> 32; + c->cpu_mhz_static = mhz & 0xffffffff; +} + +void s390_update_cpu_mhz(void) +{ + s390_adjust_jiffies(); + if (machine_has_cpu_mhz) + on_each_cpu(update_cpu_mhz, NULL, 0); +} + +void notrace cpu_relax_yield(void) +{ + if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) { + diag_stat_inc(DIAG_STAT_X044); + asm volatile("diag 0,0,0x44"); + } + barrier(); +} +EXPORT_SYMBOL(cpu_relax_yield); + +/* + * cpu_init - initializes state that is per-CPU. + */ +void cpu_init(void) +{ + struct cpuid *id = this_cpu_ptr(&cpu_info.cpu_id); + + get_cpu_id(id); + if (machine_has_cpu_mhz) + update_cpu_mhz(NULL); + mmgrab(&init_mm); + current->active_mm = &init_mm; + BUG_ON(current->mm); + enter_lazy_tlb(&init_mm, current); +} + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + return elf_hwcap & (1UL << num); +} +EXPORT_SYMBOL(cpu_have_feature); + +static void show_facilities(struct seq_file *m) +{ + unsigned int bit; + long *facilities; + + facilities = (long *)&S390_lowcore.stfle_fac_list; + seq_puts(m, "facilities :"); + for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT) + seq_printf(m, " %d", bit); + seq_putc(m, '\n'); +} + +static void show_cpu_summary(struct seq_file *m, void *v) +{ + static const char *hwcap_str[] = { + "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", + "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs" + }; + static const char * const int_hwcap_str[] = { + "sie" + }; + int i, cpu; + + seq_printf(m, "vendor_id : IBM/S390\n" + "# processors : %i\n" + "bogomips per cpu: %lu.%02lu\n", + num_online_cpus(), loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ))%100); + seq_printf(m, "max thread id : %d\n", smp_cpu_mtid); + seq_puts(m, "features\t: "); + for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) + if (hwcap_str[i] && (elf_hwcap & (1UL << i))) + seq_printf(m, "%s ", hwcap_str[i]); + for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) + if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) + seq_printf(m, "%s ", int_hwcap_str[i]); + seq_puts(m, "\n"); + show_facilities(m); + show_cacheinfo(m); + for_each_online_cpu(cpu) { + struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu); + + seq_printf(m, "processor %d: " + "version = %02X, " + "identification = %06X, " + "machine = %04X\n", + cpu, id->version, id->ident, id->machine); + } +} + +static void show_cpu_mhz(struct seq_file *m, unsigned long n) +{ + struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + + seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); + seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); +} + +/* + * show_cpuinfo - Get information on one CPU for use by procfs. + */ +static int show_cpuinfo(struct seq_file *m, void *v) +{ + unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); + + if (n == first) + show_cpu_summary(m, v); + if (!machine_has_cpu_mhz) + return 0; + seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_mhz(m, n); + return 0; +} + +static inline void *c_update(loff_t *pos) +{ + if (*pos) + *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); + return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + get_online_cpus(); + return c_update(pos); +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_update(pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ + put_online_cpus(); +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; + +int s390_isolate_bp(void) +{ + if (!test_facility(82)) + return -EOPNOTSUPP; + set_thread_flag(TIF_ISOLATE_BP); + return 0; +} +EXPORT_SYMBOL(s390_isolate_bp); + +int s390_isolate_bp_guest(void) +{ + if (!test_facility(82)) + return -EOPNOTSUPP; + set_thread_flag(TIF_ISOLATE_BP_GUEST); + return 0; +} +EXPORT_SYMBOL(s390_isolate_bp_guest); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c new file mode 100644 index 000000000..3ffa2847c --- /dev/null +++ b/arch/s390/kernel/ptrace.c @@ -0,0 +1,1776 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ptrace user space interface. + * + * Copyright IBM Corp. 1999, 2010 + * Author(s): Denis Joseph Barrow + * Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/user.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/signal.h> +#include <linux/elf.h> +#include <linux/regset.h> +#include <linux/tracehook.h> +#include <linux/seccomp.h> +#include <linux/compat.h> +#include <trace/syscall.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <linux/uaccess.h> +#include <asm/unistd.h> +#include <asm/switch_to.h> +#include <asm/runtime_instr.h> +#include <asm/facility.h> + +#include "entry.h" + +#ifdef CONFIG_COMPAT +#include "compat_ptrace.h" +#endif + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +void update_cr_regs(struct task_struct *task) +{ + struct pt_regs *regs = task_pt_regs(task); + struct thread_struct *thread = &task->thread; + struct per_regs old, new; + union ctlreg0 cr0_old, cr0_new; + union ctlreg2 cr2_old, cr2_new; + int cr0_changed, cr2_changed; + + __ctl_store(cr0_old.val, 0, 0); + __ctl_store(cr2_old.val, 2, 2); + cr0_new = cr0_old; + cr2_new = cr2_old; + /* Take care of the enable/disable of transactional execution. */ + if (MACHINE_HAS_TE) { + /* Set or clear transaction execution TXC bit 8. */ + cr0_new.tcx = 1; + if (task->thread.per_flags & PER_FLAG_NO_TE) + cr0_new.tcx = 0; + /* Set or clear transaction execution TDC bits 62 and 63. */ + cr2_new.tdc = 0; + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) + cr2_new.tdc = 1; + else + cr2_new.tdc = 2; + } + } + /* Take care of enable/disable of guarded storage. */ + if (MACHINE_HAS_GS) { + cr2_new.gse = 0; + if (task->thread.gs_cb) + cr2_new.gse = 1; + } + /* Load control register 0/2 iff changed */ + cr0_changed = cr0_new.val != cr0_old.val; + cr2_changed = cr2_new.val != cr2_old.val; + if (cr0_changed) + __ctl_load(cr0_new.val, 0, 0); + if (cr2_changed) + __ctl_load(cr2_new.val, 2, 2); + /* Copy user specified PER registers */ + new.control = thread->per_user.control; + new.start = thread->per_user.start; + new.end = thread->per_user.end; + + /* merge TIF_SINGLE_STEP into user specified PER registers. */ + if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || + test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { + if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) + new.control |= PER_EVENT_BRANCH; + else + new.control |= PER_EVENT_IFETCH; + new.control |= PER_CONTROL_SUSPENSION; + new.control |= PER_EVENT_TRANSACTION_END; + if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) + new.control |= PER_EVENT_IFETCH; + new.start = 0; + new.end = -1UL; + } + + /* Take care of the PER enablement bit in the PSW. */ + if (!(new.control & PER_EVENT_MASK)) { + regs->psw.mask &= ~PSW_MASK_PER; + return; + } + regs->psw.mask |= PSW_MASK_PER; + __ctl_store(old, 9, 11); + if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) + __ctl_load(new, 9, 11); +} + +void user_enable_single_step(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_BLOCK_STEP); + set_tsk_thread_flag(task, TIF_SINGLE_STEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_BLOCK_STEP); + clear_tsk_thread_flag(task, TIF_SINGLE_STEP); +} + +void user_enable_block_step(struct task_struct *task) +{ + set_tsk_thread_flag(task, TIF_SINGLE_STEP); + set_tsk_thread_flag(task, TIF_BLOCK_STEP); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Clear all debugging related fields. + */ +void ptrace_disable(struct task_struct *task) +{ + memset(&task->thread.per_user, 0, sizeof(task->thread.per_user)); + memset(&task->thread.per_event, 0, sizeof(task->thread.per_event)); + clear_tsk_thread_flag(task, TIF_SINGLE_STEP); + clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP); + task->thread.per_flags = 0; +} + +#define __ADDR_MASK 7 + +static inline unsigned long __peek_user_per(struct task_struct *child, + addr_t addr) +{ + struct per_struct_kernel *dummy = NULL; + + if (addr == (addr_t) &dummy->cr9) + /* Control bits of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + PER_EVENT_IFETCH : child->thread.per_user.control; + else if (addr == (addr_t) &dummy->cr10) + /* Start address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + 0 : child->thread.per_user.start; + else if (addr == (addr_t) &dummy->cr11) + /* End address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + -1UL : child->thread.per_user.end; + else if (addr == (addr_t) &dummy->bits) + /* Single-step bit. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + (1UL << (BITS_PER_LONG - 1)) : 0; + else if (addr == (addr_t) &dummy->starting_addr) + /* Start address of the user specified per set. */ + return child->thread.per_user.start; + else if (addr == (addr_t) &dummy->ending_addr) + /* End address of the user specified per set. */ + return child->thread.per_user.end; + else if (addr == (addr_t) &dummy->perc_atmid) + /* PER code, ATMID and AI of the last PER trap */ + return (unsigned long) + child->thread.per_event.cause << (BITS_PER_LONG - 16); + else if (addr == (addr_t) &dummy->address) + /* Address of the last PER trap */ + return child->thread.per_event.address; + else if (addr == (addr_t) &dummy->access_id) + /* Access id of the last PER trap */ + return (unsigned long) + child->thread.per_event.paid << (BITS_PER_LONG - 8); + return 0; +} + +/* + * Read the word at offset addr from the user area of a process. The + * trouble here is that the information is littered over different + * locations. The process registers are found on the kernel stack, + * the floating point stuff and the trace settings are stored in + * the task structure. In addition the different structures in + * struct user contain pad bytes that should be read as zeroes. + * Lovely... + */ +static unsigned long __peek_user(struct task_struct *child, addr_t addr) +{ + struct user *dummy = NULL; + addr_t offset, tmp; + + if (addr < (addr_t) &dummy->regs.acrs) { + /* + * psw and gprs are stored on the stack + */ + tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); + if (addr == (addr_t) &dummy->regs.psw.mask) { + /* Return a clean psw mask. */ + tmp &= PSW_MASK_USER | PSW_MASK_RI; + tmp |= PSW_USER_BITS; + } + + } else if (addr < (addr_t) &dummy->regs.orig_gpr2) { + /* + * access registers are stored in the thread structure + */ + offset = addr - (addr_t) &dummy->regs.acrs; + /* + * Very special case: old & broken 64 bit gdb reading + * from acrs[15]. Result is a 64 bit value. Read the + * 32 bit acrs[15] value and shift it by 32. Sick... + */ + if (addr == (addr_t) &dummy->regs.acrs[15]) + tmp = ((unsigned long) child->thread.acrs[15]) << 32; + else + tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); + + } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + /* + * orig_gpr2 is stored on the kernel stack + */ + tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + + } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + /* + * floating point control reg. is in the thread structure + */ + tmp = child->thread.fpu.fpc; + tmp <<= BITS_PER_LONG - 32; + + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; + if (MACHINE_HAS_VX) + tmp = *(addr_t *) + ((addr_t) child->thread.fpu.vxrs + 2*offset); + else + tmp = *(addr_t *) + ((addr_t) child->thread.fpu.fprs + offset); + + } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + /* + * Handle access to the per_info structure. + */ + addr -= (addr_t) &dummy->regs.per_info; + tmp = __peek_user_per(child, addr); + + } else + tmp = 0; + + return tmp; +} + +static int +peek_user(struct task_struct *child, addr_t addr, addr_t data) +{ + addr_t tmp, mask; + + /* + * Stupid gdb peeks/pokes the access registers in 64 bit with + * an alignment of 4. Programmers from hell... + */ + mask = __ADDR_MASK; + if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && + addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + mask = 3; + if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) + return -EIO; + + tmp = __peek_user(child, addr); + return put_user(tmp, (addr_t __user *) data); +} + +static inline void __poke_user_per(struct task_struct *child, + addr_t addr, addr_t data) +{ + struct per_struct_kernel *dummy = NULL; + + /* + * There are only three fields in the per_info struct that the + * debugger user can write to. + * 1) cr9: the debugger wants to set a new PER event mask + * 2) starting_addr: the debugger wants to set a new starting + * address to use with the PER event mask. + * 3) ending_addr: the debugger wants to set a new ending + * address to use with the PER event mask. + * The user specified PER event mask and the start and end + * addresses are used only if single stepping is not in effect. + * Writes to any other field in per_info are ignored. + */ + if (addr == (addr_t) &dummy->cr9) + /* PER event mask of the user specified per set. */ + child->thread.per_user.control = + data & (PER_EVENT_MASK | PER_CONTROL_MASK); + else if (addr == (addr_t) &dummy->starting_addr) + /* Starting address of the user specified per set. */ + child->thread.per_user.start = data; + else if (addr == (addr_t) &dummy->ending_addr) + /* Ending address of the user specified per set. */ + child->thread.per_user.end = data; +} + +static void fixup_int_code(struct task_struct *child, addr_t data) +{ + struct pt_regs *regs = task_pt_regs(child); + int ilc = regs->int_code >> 16; + u16 insn; + + if (ilc > 6) + return; + + if (ptrace_access_vm(child, regs->psw.addr - (regs->int_code >> 16), + &insn, sizeof(insn), FOLL_FORCE) != sizeof(insn)) + return; + + /* double check that tracee stopped on svc instruction */ + if ((insn >> 8) != 0xa) + return; + + regs->int_code = 0x20000 | (data & 0xffff); +} +/* + * Write a word to the user area of a process at location addr. This + * operation does have an additional problem compared to peek_user. + * Stores to the program status word and on the floating point + * control register needs to get checked for validity. + */ +static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) +{ + struct user *dummy = NULL; + addr_t offset; + + + if (addr < (addr_t) &dummy->regs.acrs) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw and gprs are stored on the stack + */ + if (addr == (addr_t) &dummy->regs.psw.mask) { + unsigned long mask = PSW_MASK_USER; + + mask |= is_ri_task(child) ? PSW_MASK_RI : 0; + if ((data ^ PSW_USER_BITS) & ~mask) + /* Invalid psw mask. */ + return -EINVAL; + if ((data & PSW_MASK_ASC) == PSW_ASC_HOME) + /* Invalid address-space-control bits */ + return -EINVAL; + if ((data & PSW_MASK_EA) && !(data & PSW_MASK_BA)) + /* Invalid addressing mode bits */ + return -EINVAL; + } + + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct user, regs.gprs[2])) + fixup_int_code(child, data); + *(addr_t *)((addr_t) ®s->psw + addr) = data; + + } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - (addr_t) &dummy->regs.acrs; + /* + * Very special case: old & broken 64 bit gdb writing + * to acrs[15] with a 64 bit value. Ignore the lower + * half of the value and write the upper 32 bit to + * acrs[15]. Sick... + */ + if (addr == (addr_t) &dummy->regs.acrs[15]) + child->thread.acrs[15] = (unsigned int) (data >> 32); + else + *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; + + } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + /* + * orig_gpr2 is stored on the kernel stack + */ + task_pt_regs(child)->orig_gpr2 = data; + + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent writes of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + + } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + /* + * floating point control reg. is in the thread structure + */ + if ((unsigned int) data != 0 || + test_fp_ctl(data >> (BITS_PER_LONG - 32))) + return -EINVAL; + child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); + + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; + if (MACHINE_HAS_VX) + *(addr_t *)((addr_t) + child->thread.fpu.vxrs + 2*offset) = data; + else + *(addr_t *)((addr_t) + child->thread.fpu.fprs + offset) = data; + + } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + /* + * Handle access to the per_info structure. + */ + addr -= (addr_t) &dummy->regs.per_info; + __poke_user_per(child, addr, data); + + } + + return 0; +} + +static int poke_user(struct task_struct *child, addr_t addr, addr_t data) +{ + addr_t mask; + + /* + * Stupid gdb peeks/pokes the access registers in 64 bit with + * an alignment of 4. Programmers from hell indeed... + */ + mask = __ADDR_MASK; + if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && + addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + mask = 3; + if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) + return -EIO; + + return __poke_user(child, addr, data); +} + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + ptrace_area parea; + int copied, ret; + + switch (request) { + case PTRACE_PEEKUSR: + /* read the word at location addr in the USER area. */ + return peek_user(child, addr, data); + + case PTRACE_POKEUSR: + /* write the word at location addr in the USER area */ + return poke_user(child, addr, data); + + case PTRACE_PEEKUSR_AREA: + case PTRACE_POKEUSR_AREA: + if (copy_from_user(&parea, (void __force __user *) addr, + sizeof(parea))) + return -EFAULT; + addr = parea.kernel_addr; + data = parea.process_addr; + copied = 0; + while (copied < parea.len) { + if (request == PTRACE_PEEKUSR_AREA) + ret = peek_user(child, addr, data); + else { + addr_t utmp; + if (get_user(utmp, + (addr_t __force __user *) data)) + return -EFAULT; + ret = poke_user(child, addr, utmp); + } + if (ret) + return ret; + addr += sizeof(unsigned long); + data += sizeof(unsigned long); + copied += sizeof(unsigned long); + } + return 0; + case PTRACE_GET_LAST_BREAK: + put_user(child->thread.last_break, + (unsigned long __user *) data); + return 0; + case PTRACE_ENABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags &= ~PER_FLAG_NO_TE; + return 0; + case PTRACE_DISABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags |= PER_FLAG_NO_TE; + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; + return 0; + case PTRACE_TE_ABORT_RAND: + if (!MACHINE_HAS_TE || (child->thread.per_flags & PER_FLAG_NO_TE)) + return -EIO; + switch (data) { + case 0UL: + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; + break; + case 1UL: + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND; + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND_TEND; + break; + case 2UL: + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND; + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND_TEND; + break; + default: + return -EINVAL; + } + return 0; + default: + return ptrace_request(child, request, addr, data); + } +} + +#ifdef CONFIG_COMPAT +/* + * Now the fun part starts... a 31 bit program running in the + * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, + * PTRACE_PEEKDATA, PTRACE_POKETEXT and PTRACE_POKEDATA are easy + * to handle, the difference to the 64 bit versions of the requests + * is that the access is done in multiples of 4 byte instead of + * 8 bytes (sizeof(unsigned long) on 31/64 bit). + * The ugly part are PTRACE_PEEKUSR, PTRACE_PEEKUSR_AREA, + * PTRACE_POKEUSR and PTRACE_POKEUSR_AREA. If the traced program + * is a 31 bit program too, the content of struct user can be + * emulated. A 31 bit program peeking into the struct user of + * a 64 bit program is a no-no. + */ + +/* + * Same as peek_user_per but for a 31 bit program. + */ +static inline __u32 __peek_user_per_compat(struct task_struct *child, + addr_t addr) +{ + struct compat_per_struct_kernel *dummy32 = NULL; + + if (addr == (addr_t) &dummy32->cr9) + /* Control bits of the active per set. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + PER_EVENT_IFETCH : child->thread.per_user.control; + else if (addr == (addr_t) &dummy32->cr10) + /* Start address of the active per set. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + 0 : child->thread.per_user.start; + else if (addr == (addr_t) &dummy32->cr11) + /* End address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + PSW32_ADDR_INSN : child->thread.per_user.end; + else if (addr == (addr_t) &dummy32->bits) + /* Single-step bit. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + 0x80000000 : 0; + else if (addr == (addr_t) &dummy32->starting_addr) + /* Start address of the user specified per set. */ + return (__u32) child->thread.per_user.start; + else if (addr == (addr_t) &dummy32->ending_addr) + /* End address of the user specified per set. */ + return (__u32) child->thread.per_user.end; + else if (addr == (addr_t) &dummy32->perc_atmid) + /* PER code, ATMID and AI of the last PER trap */ + return (__u32) child->thread.per_event.cause << 16; + else if (addr == (addr_t) &dummy32->address) + /* Address of the last PER trap */ + return (__u32) child->thread.per_event.address; + else if (addr == (addr_t) &dummy32->access_id) + /* Access id of the last PER trap */ + return (__u32) child->thread.per_event.paid << 24; + return 0; +} + +/* + * Same as peek_user but for a 31 bit program. + */ +static u32 __peek_user_compat(struct task_struct *child, addr_t addr) +{ + struct compat_user *dummy32 = NULL; + addr_t offset; + __u32 tmp; + + if (addr < (addr_t) &dummy32->regs.acrs) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw and gprs are stored on the stack + */ + if (addr == (addr_t) &dummy32->regs.psw.mask) { + /* Fake a 31 bit psw mask. */ + tmp = (__u32)(regs->psw.mask >> 32); + tmp &= PSW32_MASK_USER | PSW32_MASK_RI; + tmp |= PSW32_USER_BITS; + } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + /* Fake a 31 bit psw address. */ + tmp = (__u32) regs->psw.addr | + (__u32)(regs->psw.mask & PSW_MASK_BA); + } else { + /* gpr 0-15 */ + tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); + } + } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - (addr_t) &dummy32->regs.acrs; + tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); + + } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + + } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + /* + * floating point control reg. is in the thread structure + */ + tmp = child->thread.fpu.fpc; + + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + if (MACHINE_HAS_VX) + tmp = *(__u32 *) + ((addr_t) child->thread.fpu.vxrs + 2*offset); + else + tmp = *(__u32 *) + ((addr_t) child->thread.fpu.fprs + offset); + + } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + /* + * Handle access to the per_info structure. + */ + addr -= (addr_t) &dummy32->regs.per_info; + tmp = __peek_user_per_compat(child, addr); + + } else + tmp = 0; + + return tmp; +} + +static int peek_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + __u32 tmp; + + if (!is_compat_task() || (addr & 3) || addr > sizeof(struct user) - 3) + return -EIO; + + tmp = __peek_user_compat(child, addr); + return put_user(tmp, (__u32 __user *) data); +} + +/* + * Same as poke_user_per but for a 31 bit program. + */ +static inline void __poke_user_per_compat(struct task_struct *child, + addr_t addr, __u32 data) +{ + struct compat_per_struct_kernel *dummy32 = NULL; + + if (addr == (addr_t) &dummy32->cr9) + /* PER event mask of the user specified per set. */ + child->thread.per_user.control = + data & (PER_EVENT_MASK | PER_CONTROL_MASK); + else if (addr == (addr_t) &dummy32->starting_addr) + /* Starting address of the user specified per set. */ + child->thread.per_user.start = data; + else if (addr == (addr_t) &dummy32->ending_addr) + /* Ending address of the user specified per set. */ + child->thread.per_user.end = data; +} + +/* + * Same as poke_user but for a 31 bit program. + */ +static int __poke_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + struct compat_user *dummy32 = NULL; + __u32 tmp = (__u32) data; + addr_t offset; + + if (addr < (addr_t) &dummy32->regs.acrs) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw, gprs, acrs and orig_gpr2 are stored on the stack + */ + if (addr == (addr_t) &dummy32->regs.psw.mask) { + __u32 mask = PSW32_MASK_USER; + + mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; + /* Build a 64 bit psw mask from 31 bit mask. */ + if ((tmp ^ PSW32_USER_BITS) & ~mask) + /* Invalid psw mask. */ + return -EINVAL; + if ((data & PSW32_MASK_ASC) == PSW32_ASC_HOME) + /* Invalid address-space-control bits */ + return -EINVAL; + regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | + (regs->psw.mask & PSW_MASK_BA) | + (__u64)(tmp & mask) << 32; + } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + /* Build a 64 bit psw address from 31 bit address. */ + regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; + /* Transfer 31 bit amode bit to psw mask. */ + regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | + (__u64)(tmp & PSW32_ADDR_AMODE); + } else { + + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct compat_user, regs.gprs[2])) + fixup_int_code(child, data); + /* gpr 0-15 */ + *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; + } + } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - (addr_t) &dummy32->regs.acrs; + *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; + + } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent writess of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + + } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + /* + * floating point control reg. is in the thread structure + */ + if (test_fp_ctl(tmp)) + return -EINVAL; + child->thread.fpu.fpc = data; + + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + if (MACHINE_HAS_VX) + *(__u32 *)((addr_t) + child->thread.fpu.vxrs + 2*offset) = tmp; + else + *(__u32 *)((addr_t) + child->thread.fpu.fprs + offset) = tmp; + + } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + /* + * Handle access to the per_info structure. + */ + addr -= (addr_t) &dummy32->regs.per_info; + __poke_user_per_compat(child, addr, data); + } + + return 0; +} + +static int poke_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + if (!is_compat_task() || (addr & 3) || + addr > sizeof(struct compat_user) - 3) + return -EIO; + + return __poke_user_compat(child, addr, data); +} + +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + compat_ptrace_area parea; + int copied, ret; + + switch (request) { + case PTRACE_PEEKUSR: + /* read the word at location addr in the USER area. */ + return peek_user_compat(child, addr, data); + + case PTRACE_POKEUSR: + /* write the word at location addr in the USER area */ + return poke_user_compat(child, addr, data); + + case PTRACE_PEEKUSR_AREA: + case PTRACE_POKEUSR_AREA: + if (copy_from_user(&parea, (void __force __user *) addr, + sizeof(parea))) + return -EFAULT; + addr = parea.kernel_addr; + data = parea.process_addr; + copied = 0; + while (copied < parea.len) { + if (request == PTRACE_PEEKUSR_AREA) + ret = peek_user_compat(child, addr, data); + else { + __u32 utmp; + if (get_user(utmp, + (__u32 __force __user *) data)) + return -EFAULT; + ret = poke_user_compat(child, addr, utmp); + } + if (ret) + return ret; + addr += sizeof(unsigned int); + data += sizeof(unsigned int); + copied += sizeof(unsigned int); + } + return 0; + case PTRACE_GET_LAST_BREAK: + put_user(child->thread.last_break, + (unsigned int __user *) data); + return 0; + } + return compat_ptrace_request(child, request, addr, data); +} +#endif + +asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) +{ + unsigned long mask = -1UL; + + /* + * The sysc_tracesys code in entry.S stored the system + * call number to gprs[2]. + */ + if (test_thread_flag(TIF_SYSCALL_TRACE) && + (tracehook_report_syscall_entry(regs) || + regs->gprs[2] >= NR_syscalls)) { + /* + * Tracing decided this syscall should not happen or the + * debugger stored an invalid system call number. Skip + * the system call and the system call restart handling. + */ + clear_pt_regs_flag(regs, PIF_SYSCALL); + return -1; + } + + /* Do the secure computing check after ptrace. */ + if (secure_computing(NULL)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; + } + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gprs[2]); + + if (is_compat_task()) + mask = 0xffffffff; + + audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask, + regs->gprs[3] &mask, regs->gprs[4] &mask, + regs->gprs[5] &mask); + + return regs->gprs[2]; +} + +asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) +{ + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->gprs[2]); + + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); +} + +/* + * user_regset definitions. + */ + +static int s390_regs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + unsigned long *k = kbuf; + while (count > 0) { + *k++ = __peek_user(target, pos); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + unsigned long __user *u = ubuf; + while (count > 0) { + if (__put_user(__peek_user(target, pos), u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return 0; +} + +static int s390_regs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc = 0; + + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + const unsigned long *k = kbuf; + while (count > 0 && !rc) { + rc = __poke_user(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count > 0 && !rc) { + unsigned long word; + rc = __get_user(word, u++); + if (rc) + break; + rc = __poke_user(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + if (rc == 0 && target == current) + restore_access_regs(target->thread.acrs); + + return rc; +} + +static int s390_fpregs_get(struct task_struct *target, + const struct user_regset *regset, unsigned int pos, + unsigned int count, void *kbuf, void __user *ubuf) +{ + _s390_fp_regs fp_regs; + + if (target == current) + save_fpu_regs(); + + fp_regs.fpc = target->thread.fpu.fpc; + fpregs_store(&fp_regs, &target->thread.fpu); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fp_regs, 0, -1); +} + +static int s390_fpregs_set(struct task_struct *target, + const struct user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + int rc = 0; + freg_t fprs[__NUM_FPRS]; + + if (target == current) + save_fpu_regs(); + + if (MACHINE_HAS_VX) + convert_vx_to_fp(fprs, target->thread.fpu.vxrs); + else + memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); + + /* If setting FPC, must validate it first. */ + if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { + u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, + 0, offsetof(s390_fp_regs, fprs)); + if (rc) + return rc; + if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + return -EINVAL; + target->thread.fpu.fpc = ufpc[0]; + } + + if (rc == 0 && count > 0) + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + fprs, offsetof(s390_fp_regs, fprs), -1); + if (rc) + return rc; + + if (MACHINE_HAS_VX) + convert_fp_to_vx(target->thread.fpu.vxrs, fprs); + else + memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); + + return rc; +} + +static int s390_last_break_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (count > 0) { + if (kbuf) { + unsigned long *k = kbuf; + *k = target->thread.last_break; + } else { + unsigned long __user *u = ubuf; + if (__put_user(target->thread.last_break, u)) + return -EFAULT; + } + } + return 0; +} + +static int s390_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static int s390_tdb_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct pt_regs *regs = task_pt_regs(target); + unsigned char *data; + + if (!(regs->int_code & 0x200)) + return -ENODATA; + data = target->thread.trap_tdb; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256); +} + +static int s390_tdb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static int s390_vxrs_low_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); +} + +static int s390_vxrs_low_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i, rc; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + if (rc == 0) + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + + return rc; +} + +static int s390_vxrs_high_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + __vector128 vxrs[__NUM_VXRS_HIGH]; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs)); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); +} + +static int s390_vxrs_high_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1); + return rc; +} + +static int s390_system_call_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + unsigned int *data = &target->thread.system_call; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(unsigned int)); +} + +static int s390_system_call_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + unsigned int *data = &target->thread.system_call; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(unsigned int)); +} + +static int s390_gs_cb_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + if (target == current) + save_gs_cb(data); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static int s390_gs_cb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb gs_cb = { }, *data = NULL; + int rc; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!target->thread.gs_cb) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + } + if (!target->thread.gs_cb) + gs_cb.gsd = 25; + else if (target == current) + save_gs_cb(&gs_cb); + else + gs_cb = *target->thread.gs_cb; + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &gs_cb, 0, sizeof(gs_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + preempt_disable(); + if (!target->thread.gs_cb) + target->thread.gs_cb = data; + *target->thread.gs_cb = gs_cb; + if (target == current) { + __ctl_set_bit(2, 4); + restore_gs_cb(target->thread.gs_cb); + } + preempt_enable(); + return rc; +} + +static int s390_gs_bc_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static int s390_gs_bc_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + target->thread.gs_bc_cb = data; + } + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static bool is_ri_cb_valid(struct runtime_instr_cb *cb) +{ + return (cb->rca & 0x1f) == 0 && + (cb->roa & 0xfff) == 0 && + (cb->rla & 0xfff) == 0xfff && + cb->s == 1 && + cb->k == 1 && + cb->h == 0 && + cb->reserved1 == 0 && + cb->ps == 1 && + cb->qs == 0 && + cb->pc == 1 && + cb->qc == 0 && + cb->reserved2 == 0 && + cb->reserved3 == 0 && + cb->reserved4 == 0 && + cb->reserved5 == 0 && + cb->reserved6 == 0 && + cb->reserved7 == 0 && + cb->reserved8 == 0 && + cb->rla >= cb->roa && + cb->rca >= cb->roa && + cb->rca <= cb->rla+1 && + cb->m < 3; +} + +static int s390_runtime_instr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct runtime_instr_cb *data = target->thread.ri_cb; + + if (!test_facility(64)) + return -ENODEV; + if (!data) + return -ENODATA; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct runtime_instr_cb)); +} + +static int s390_runtime_instr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct runtime_instr_cb ri_cb = { }, *data = NULL; + int rc; + + if (!test_facility(64)) + return -ENODEV; + + if (!target->thread.ri_cb) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + } + + if (target->thread.ri_cb) { + if (target == current) + store_runtime_instr_cb(&ri_cb); + else + ri_cb = *target->thread.ri_cb; + } + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &ri_cb, 0, sizeof(struct runtime_instr_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + + if (!is_ri_cb_valid(&ri_cb)) { + kfree(data); + return -EINVAL; + } + /* + * Override access key in any case, since user space should + * not be able to set it, nor should it care about it. + */ + ri_cb.key = PAGE_DEFAULT_KEY >> 4; + preempt_disable(); + if (!target->thread.ri_cb) + target->thread.ri_cb = data; + *target->thread.ri_cb = ri_cb; + if (target == current) + load_runtime_instr_cb(target->thread.ri_cb); + preempt_enable(); + + return 0; +} + +static const struct user_regset s390_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = sizeof(s390_regs) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .get = s390_regs_get, + .set = s390_regs_set, + }, + { + .core_note_type = NT_PRFPREG, + .n = sizeof(s390_fp_regs) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .get = s390_fpregs_get, + .set = s390_fpregs_set, + }, + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(unsigned int), + .align = sizeof(unsigned int), + .get = s390_system_call_get, + .set = s390_system_call_set, + }, + { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .get = s390_last_break_get, + .set = s390_last_break_set, + }, + { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .get = s390_tdb_get, + .set = s390_tdb_set, + }, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, + }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, + { + .core_note_type = NT_S390_GS_BC, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_bc_get, + .set = s390_gs_bc_set, + }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, +}; + +static const struct user_regset_view user_s390_view = { + .name = UTS_MACHINE, + .e_machine = EM_S390, + .regsets = s390_regsets, + .n = ARRAY_SIZE(s390_regsets) +}; + +#ifdef CONFIG_COMPAT +static int s390_compat_regs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count > 0) { + *k++ = __peek_user_compat(target, pos); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count > 0) { + if (__put_user(__peek_user_compat(target, pos), u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return 0; +} + +static int s390_compat_regs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc = 0; + + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0 && !rc) { + rc = __poke_user_compat(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !rc) { + compat_ulong_t word; + rc = __get_user(word, u++); + if (rc) + break; + rc = __poke_user_compat(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + if (rc == 0 && target == current) + restore_access_regs(target->thread.acrs); + + return rc; +} + +static int s390_compat_regs_high_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + compat_ulong_t *gprs_high; + + gprs_high = (compat_ulong_t *) + &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count > 0) { + *k++ = *gprs_high; + gprs_high += 2; + count -= sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count > 0) { + if (__put_user(*gprs_high, u++)) + return -EFAULT; + gprs_high += 2; + count -= sizeof(*u); + } + } + return 0; +} + +static int s390_compat_regs_high_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + compat_ulong_t *gprs_high; + int rc = 0; + + gprs_high = (compat_ulong_t *) + &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0) { + *gprs_high = *k++; + *gprs_high += 2; + count -= sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !rc) { + unsigned long word; + rc = __get_user(word, u++); + if (rc) + break; + *gprs_high = word; + *gprs_high += 2; + count -= sizeof(*u); + } + } + + return rc; +} + +static int s390_compat_last_break_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + compat_ulong_t last_break; + + if (count > 0) { + last_break = target->thread.last_break; + if (kbuf) { + unsigned long *k = kbuf; + *k = last_break; + } else { + unsigned long __user *u = ubuf; + if (__put_user(last_break, u)) + return -EFAULT; + } + } + return 0; +} + +static int s390_compat_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static const struct user_regset s390_compat_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .get = s390_compat_regs_get, + .set = s390_compat_regs_set, + }, + { + .core_note_type = NT_PRFPREG, + .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .get = s390_fpregs_get, + .set = s390_fpregs_set, + }, + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(compat_uint_t), + .align = sizeof(compat_uint_t), + .get = s390_system_call_get, + .set = s390_system_call_set, + }, + { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .get = s390_compat_last_break_get, + .set = s390_compat_last_break_set, + }, + { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .get = s390_tdb_get, + .set = s390_tdb_set, + }, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, + }, + { + .core_note_type = NT_S390_HIGH_GPRS, + .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .get = s390_compat_regs_high_get, + .set = s390_compat_regs_high_set, + }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, + { + .core_note_type = NT_S390_GS_BC, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_bc_get, + .set = s390_gs_bc_set, + }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, +}; + +static const struct user_regset_view user_s390_compat_view = { + .name = "s390", + .e_machine = EM_S390, + .regsets = s390_compat_regsets, + .n = ARRAY_SIZE(s390_compat_regsets) +}; +#endif + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) + return &user_s390_compat_view; +#endif + return &user_s390_view; +} + +static const char *gpr_names[NUM_GPRS] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", +}; + +unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) +{ + if (offset >= NUM_GPRS) + return 0; + return regs->gprs[offset]; +} + +int regs_query_register_offset(const char *name) +{ + unsigned long offset; + + if (!name || *name != 'r') + return -EINVAL; + if (kstrtoul(name + 1, 10, &offset)) + return -EINVAL; + if (offset >= NUM_GPRS) + return -EINVAL; + return offset; +} + +const char *regs_query_register_name(unsigned int offset) +{ + if (offset >= NUM_GPRS) + return NULL; + return gpr_names[offset]; +} + +static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + unsigned long ksp = kernel_stack_pointer(regs); + + return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs:pt_regs which contains kernel stack pointer. + * @n:stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long addr; + + addr = kernel_stack_pointer(regs) + n * sizeof(long); + if (!regs_within_kernel_stack(regs, addr)) + return 0; + return *(unsigned long *)addr; +} diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S new file mode 100644 index 000000000..7f14adf51 --- /dev/null +++ b/arch/s390/kernel/reipl.S @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp 2000, 2011 + * Author(s): Holger Smolinski <Holger.Smolinski@de.ibm.com>, + * Denis Joseph Barrow, + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> +#include <asm/sigp.h> + + GEN_BR_THUNK %r9 + +# +# Issue "store status" for the current CPU to its prefix page +# and call passed function afterwards +# +# r2 = Function to be called after store status +# r3 = Parameter for function +# +ENTRY(store_status) + /* Save register one and load save area base */ + stg %r1,__LC_SAVE_AREA_RESTART + /* General purpose registers */ + lghi %r1,__LC_GPREGS_SAVE_AREA + stmg %r0,%r15,0(%r1) + mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + /* Control registers */ + lghi %r1,__LC_CREGS_SAVE_AREA + stctg %c0,%c15,0(%r1) + /* Access registers */ + lghi %r1,__LC_AREGS_SAVE_AREA + stam %a0,%a15,0(%r1) + /* Floating point registers */ + lghi %r1,__LC_FPREGS_SAVE_AREA + std %f0, 0x00(%r1) + std %f1, 0x08(%r1) + std %f2, 0x10(%r1) + std %f3, 0x18(%r1) + std %f4, 0x20(%r1) + std %f5, 0x28(%r1) + std %f6, 0x30(%r1) + std %f7, 0x38(%r1) + std %f8, 0x40(%r1) + std %f9, 0x48(%r1) + std %f10,0x50(%r1) + std %f11,0x58(%r1) + std %f12,0x60(%r1) + std %f13,0x68(%r1) + std %f14,0x70(%r1) + std %f15,0x78(%r1) + /* Floating point control register */ + lghi %r1,__LC_FP_CREG_SAVE_AREA + stfpc 0(%r1) + /* CPU timer */ + lghi %r1,__LC_CPU_TIMER_SAVE_AREA + stpt 0(%r1) + /* Store prefix register */ + lghi %r1,__LC_PREFIX_SAVE_AREA + stpx 0(%r1) + /* Clock comparator - seven bytes */ + lghi %r1,__LC_CLOCK_COMP_SAVE_AREA + larl %r4,.Lclkcmp + stckc 0(%r4) + mvc 1(7,%r1),1(%r4) + /* Program status word */ + lghi %r1,__LC_PSW_SAVE_AREA + epsw %r4,%r5 + st %r4,0(%r1) + st %r5,4(%r1) + stg %r2,8(%r1) + lgr %r9,%r2 + lgr %r2,%r3 + BR_EX %r9 + + .section .bss + .align 8 +.Lclkcmp: .quad 0x0000000000000000 + .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S new file mode 100644 index 000000000..c97c2d40f --- /dev/null +++ b/arch/s390/kernel/relocate_kernel.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2005 + * + * Author(s): Rolf Adelsberger, + * Heiko Carstens <heiko.carstens@de.ibm.com> + * + */ + +#include <linux/linkage.h> +#include <asm/page.h> +#include <asm/sigp.h> + +/* + * moves the new kernel to its destination... + * %r2 = pointer to first kimage_entry_t + * %r3 = start address - where to jump to after the job is done... + * + * %r5 will be used as temp. storage + * %r6 holds the destination address + * %r7 = PAGE_SIZE + * %r8 holds the source address + * %r9 = PAGE_SIZE + * + * 0xf000 is a page_mask + */ + + .text +ENTRY(relocate_kernel) + basr %r13,0 # base address + .base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration + .indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... + .done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... + .source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking + 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base + .done: + sgr %r0,%r0 # clear register r0 + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 + diag %r0,%r0,0x308 + + .align 8 + load_psw: + .long 0x00080000,0x80000000 + relocate_kernel_end: + .align 8 + .globl relocate_kernel_len + relocate_kernel_len: + .quad relocate_kernel_end - relocate_kernel diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c new file mode 100644 index 000000000..1788a5454 --- /dev/null +++ b/arch/s390/kernel/runtime_instr.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/signal.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/kernel_stat.h> +#include <linux/sched/task_stack.h> + +#include <asm/runtime_instr.h> +#include <asm/cpu_mf.h> +#include <asm/irq.h> + +#include "entry.h" + +/* empty control block to disable RI by loading it */ +struct runtime_instr_cb runtime_instr_empty_cb; + +void runtime_instr_release(struct task_struct *tsk) +{ + kfree(tsk->thread.ri_cb); +} + +static void disable_runtime_instr(void) +{ + struct task_struct *task = current; + struct pt_regs *regs; + + if (!task->thread.ri_cb) + return; + regs = task_pt_regs(task); + preempt_disable(); + load_runtime_instr_cb(&runtime_instr_empty_cb); + kfree(task->thread.ri_cb); + task->thread.ri_cb = NULL; + preempt_enable(); + + /* + * Make sure the RI bit is deleted from the PSW. If the user did not + * switch off RI before the system call the process will get a + * specification exception otherwise. + */ + regs->psw.mask &= ~PSW_MASK_RI; +} + +static void init_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + cb->rla = 0xfff; + cb->s = 1; + cb->k = 1; + cb->ps = 1; + cb->pc = 1; + cb->key = PAGE_DEFAULT_KEY >> 4; + cb->v = 1; +} + +/* + * The signum argument is unused. In older kernels it was used to + * specify a real-time signal. For backwards compatibility user space + * should pass a valid real-time signal number (the signum argument + * was checked in older kernels). + */ +SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum) +{ + struct runtime_instr_cb *cb; + + if (!test_facility(64)) + return -EOPNOTSUPP; + + if (command == S390_RUNTIME_INSTR_STOP) { + disable_runtime_instr(); + return 0; + } + + if (command != S390_RUNTIME_INSTR_START) + return -EINVAL; + + if (!current->thread.ri_cb) { + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + } else { + cb = current->thread.ri_cb; + memset(cb, 0, sizeof(*cb)); + } + + init_runtime_instr_cb(cb); + + /* now load the control block to make it available */ + preempt_disable(); + current->thread.ri_cb = cb; + load_runtime_instr_cb(cb); + preempt_enable(); + return 0; +} diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c new file mode 100644 index 000000000..098794fc5 --- /dev/null +++ b/arch/s390/kernel/setup.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2012 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "arch/i386/kernel/setup.c" + * Copyright (C) 1995, Linus Torvalds + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#define KMSG_COMPONENT "setup" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/user.h> +#include <linux/tty.h> +#include <linux/ioport.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/root_dev.h> +#include <linux/console.h> +#include <linux/kernel_stat.h> +#include <linux/dma-contiguous.h> +#include <linux/device.h> +#include <linux/notifier.h> +#include <linux/pfn.h> +#include <linux/ctype.h> +#include <linux/reboot.h> +#include <linux/topology.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> +#include <linux/memory.h> +#include <linux/compat.h> + +#include <asm/ipl.h> +#include <asm/facility.h> +#include <asm/smp.h> +#include <asm/mmu_context.h> +#include <asm/cpcmd.h> +#include <asm/lowcore.h> +#include <asm/nmi.h> +#include <asm/irq.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/sections.h> +#include <asm/ebcdic.h> +#include <asm/diag.h> +#include <asm/os_info.h> +#include <asm/sclp.h> +#include <asm/sysinfo.h> +#include <asm/numa.h> +#include <asm/alternative.h> +#include <asm/nospec-branch.h> +#include "entry.h" + +/* + * Machine setup.. + */ +unsigned int console_mode = 0; +EXPORT_SYMBOL(console_mode); + +unsigned int console_devno = -1; +EXPORT_SYMBOL(console_devno); + +unsigned int console_irq = -1; +EXPORT_SYMBOL(console_irq); + +unsigned long elf_hwcap __read_mostly = 0; +char elf_platform[ELF_PLATFORM_SIZE]; + +unsigned long int_hwcap = 0; + +int __initdata memory_end_set; +unsigned long __initdata memory_end; +unsigned long __initdata max_physmem_end; + +unsigned long VMALLOC_START; +EXPORT_SYMBOL(VMALLOC_START); + +unsigned long VMALLOC_END; +EXPORT_SYMBOL(VMALLOC_END); + +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); + +unsigned long MODULES_VADDR; +unsigned long MODULES_END; + +/* An array with a pointer to the lowcore of every CPU. */ +struct lowcore *lowcore_ptr[NR_CPUS]; +EXPORT_SYMBOL(lowcore_ptr); + +/* + * This is set up by the setup-routine at boot-time + * for S390 need to find out, what we have to setup + * using address 0x10400 ... + */ + +#include <asm/setup.h> + +/* + * condev= and conmode= setup parameter. + */ + +static int __init condev_setup(char *str) +{ + int vdev; + + vdev = simple_strtoul(str, &str, 0); + if (vdev >= 0 && vdev < 65536) { + console_devno = vdev; + console_irq = -1; + } + return 1; +} + +__setup("condev=", condev_setup); + +static void __init set_preferred_console(void) +{ + if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) + add_preferred_console("ttyS", 0, NULL); + else if (CONSOLE_IS_3270) + add_preferred_console("tty3270", 0, NULL); + else if (CONSOLE_IS_VT220) + add_preferred_console("ttysclp", 0, NULL); + else if (CONSOLE_IS_HVC) + add_preferred_console("hvc", 0, NULL); +} + +static int __init conmode_setup(char *str) +{ +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + if (strncmp(str, "hwc", 4) == 0 || strncmp(str, "sclp", 5) == 0) + SET_CONSOLE_SCLP; +#endif +#if defined(CONFIG_TN3215_CONSOLE) + if (strncmp(str, "3215", 5) == 0) + SET_CONSOLE_3215; +#endif +#if defined(CONFIG_TN3270_CONSOLE) + if (strncmp(str, "3270", 5) == 0) + SET_CONSOLE_3270; +#endif + set_preferred_console(); + return 1; +} + +__setup("conmode=", conmode_setup); + +static void __init conmode_default(void) +{ + char query_buffer[1024]; + char *ptr; + + if (MACHINE_IS_VM) { + cpcmd("QUERY CONSOLE", query_buffer, 1024, NULL); + console_devno = simple_strtoul(query_buffer + 5, NULL, 16); + ptr = strstr(query_buffer, "SUBCHANNEL ="); + console_irq = simple_strtoul(ptr + 13, NULL, 16); + cpcmd("QUERY TERM", query_buffer, 1024, NULL); + ptr = strstr(query_buffer, "CONMODE"); + /* + * Set the conmode to 3215 so that the device recognition + * will set the cu_type of the console to 3215. If the + * conmode is 3270 and we don't set it back then both + * 3215 and the 3270 driver will try to access the console + * device (3215 as console and 3270 as normal tty). + */ + cpcmd("TERM CONMODE 3215", NULL, 0, NULL); + if (ptr == NULL) { +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + return; + } + if (strncmp(ptr + 8, "3270", 4) == 0) { +#if defined(CONFIG_TN3270_CONSOLE) + SET_CONSOLE_3270; +#elif defined(CONFIG_TN3215_CONSOLE) + SET_CONSOLE_3215; +#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } else if (strncmp(ptr + 8, "3215", 4) == 0) { +#if defined(CONFIG_TN3215_CONSOLE) + SET_CONSOLE_3215; +#elif defined(CONFIG_TN3270_CONSOLE) + SET_CONSOLE_3270; +#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } + } else if (MACHINE_IS_KVM) { + if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) + SET_CONSOLE_VT220; + else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) + SET_CONSOLE_SCLP; + else + SET_CONSOLE_HVC; + } else { +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } + if (IS_ENABLED(CONFIG_VT) && IS_ENABLED(CONFIG_DUMMY_CONSOLE)) + conswitchp = &dummy_con; +} + +#ifdef CONFIG_CRASH_DUMP +static void __init setup_zfcpdump(void) +{ + if (ipl_info.type != IPL_TYPE_FCP_DUMP) + return; + if (OLDMEM_BASE) + return; + strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + console_loglevel = 2; +} +#else +static inline void setup_zfcpdump(void) {} +#endif /* CONFIG_CRASH_DUMP */ + + /* + * Reboot, halt and power_off stubs. They just call _machine_restart, + * _machine_halt or _machine_power_off. + */ + +void machine_restart(char *command) +{ + if ((!in_interrupt() && !in_atomic()) || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_restart(command); +} + +void machine_halt(void) +{ + if (!in_interrupt() || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_halt(); +} + +void machine_power_off(void) +{ + if (!in_interrupt() || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_power_off(); +} + +/* + * Dummy power off function. + */ +void (*pm_power_off)(void) = machine_power_off; +EXPORT_SYMBOL_GPL(pm_power_off); + +static int __init early_parse_mem(char *p) +{ + memory_end = memparse(p, &p); + memory_end &= PAGE_MASK; + memory_end_set = 1; + return 0; +} +early_param("mem", early_parse_mem); + +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + VMALLOC_END = (memparse(arg, &arg) + PAGE_SIZE - 1) & PAGE_MASK; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +void *restart_stack __section(.data); + +static void __init setup_lowcore_dat_off(void) +{ + struct lowcore *lc; + + /* + * Setup lowcore for boot cpu + */ + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); + lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); + lc->restart_psw.mask = PSW_KERNEL_BITS; + lc->restart_psw.addr = (unsigned long) restart_int_handler; + lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->external_new_psw.addr = (unsigned long) ext_int_handler; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + lc->svc_new_psw.addr = (unsigned long) system_call; + lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->program_new_psw.addr = (unsigned long) pgm_check_handler; + lc->mcck_new_psw.mask = PSW_KERNEL_BITS; + lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; + lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->io_new_psw.addr = (unsigned long) io_int_handler; + lc->clock_comparator = clock_comparator_max; + lc->kernel_stack = ((unsigned long) &init_thread_union) + + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); + lc->async_stack = (unsigned long) + memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE) + + ASYNC_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); + lc->panic_stack = (unsigned long) + memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE) + + PAGE_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); + lc->current_task = (unsigned long)&init_task; + lc->lpp = LPP_MAGIC; + lc->machine_flags = S390_lowcore.machine_flags; + lc->preempt_count = S390_lowcore.preempt_count; + lc->stfl_fac_list = S390_lowcore.stfl_fac_list; + memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, + sizeof(lc->stfle_fac_list)); + memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, + sizeof(lc->alt_stfle_fac_list)); + nmi_alloc_boot_cpu(lc); + vdso_alloc_boot_cpu(lc); + lc->sync_enter_timer = S390_lowcore.sync_enter_timer; + lc->async_enter_timer = S390_lowcore.async_enter_timer; + lc->exit_timer = S390_lowcore.exit_timer; + lc->user_timer = S390_lowcore.user_timer; + lc->system_timer = S390_lowcore.system_timer; + lc->steal_timer = S390_lowcore.steal_timer; + lc->last_update_timer = S390_lowcore.last_update_timer; + lc->last_update_clock = S390_lowcore.last_update_clock; + + restart_stack = memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE); + restart_stack += ASYNC_SIZE; + + /* + * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant + * restart data to the absolute zero lowcore. This is necessary if + * PSW restart is done on an offline CPU that has lowcore zero. + */ + lc->restart_stack = (unsigned long) restart_stack; + lc->restart_fn = (unsigned long) do_restart; + lc->restart_data = 0; + lc->restart_source = -1UL; + + /* Setup absolute zero lowcore */ + mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack); + mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn); + mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data); + mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); + mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); + +#ifdef CONFIG_SMP + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; + arch_spin_lock_setup(0); +#endif + lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + + set_prefix((u32)(unsigned long) lc); + lowcore_ptr[0] = lc; +} + +static void __init setup_lowcore_dat_on(void) +{ + __ctl_clear_bit(0, 28); + S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; + S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; + S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; + S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; + __ctl_set_bit(0, 28); +} + +static struct resource code_resource = { + .name = "Kernel code", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource data_resource = { + .name = "Kernel data", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource __initdata *standard_resources[] = { + &code_resource, + &data_resource, + &bss_resource, +}; + +static void __init setup_resources(void) +{ + struct resource *res, *std_res, *sub_res; + struct memblock_region *reg; + int j; + + code_resource.start = (unsigned long) _text; + code_resource.end = (unsigned long) _etext - 1; + data_resource.start = (unsigned long) _etext; + data_resource.end = (unsigned long) _edata - 1; + bss_resource.start = (unsigned long) __bss_start; + bss_resource.end = (unsigned long) __bss_stop - 1; + + for_each_memblock(memory, reg) { + res = memblock_virt_alloc(sizeof(*res), 8); + res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + + res->name = "System RAM"; + res->start = reg->base; + res->end = reg->base + reg->size - 1; + request_resource(&iomem_resource, res); + + for (j = 0; j < ARRAY_SIZE(standard_resources); j++) { + std_res = standard_resources[j]; + if (std_res->start < res->start || + std_res->start > res->end) + continue; + if (std_res->end > res->end) { + sub_res = memblock_virt_alloc(sizeof(*sub_res), 8); + *sub_res = *std_res; + sub_res->end = res->end; + std_res->start = res->end + 1; + request_resource(res, sub_res); + } else { + request_resource(res, std_res); + } + } + } +#ifdef CONFIG_CRASH_DUMP + /* + * Re-add removed crash kernel memory as reserved memory. This makes + * sure it will be mapped with the identity mapping and struct pages + * will be created, so it can be resized later on. + * However add it later since the crash kernel resource should not be + * part of the System RAM resource. + */ + if (crashk_res.end) { + memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_reserve(crashk_res.start, resource_size(&crashk_res)); + insert_resource(&iomem_resource, &crashk_res); + } +#endif +} + +static void __init setup_memory_end(void) +{ + unsigned long vmax, vmalloc_size, tmp; + + /* Choose kernel address space layout: 2, 3, or 4 levels. */ + vmalloc_size = VMALLOC_END ?: (128UL << 30) - MODULES_LEN; + tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; + tmp = tmp * (sizeof(struct page) + PAGE_SIZE); + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ + else + vmax = _REGION1_SIZE; /* 4-level kernel page table */ + /* module area is at the end of the kernel address space. */ + MODULES_END = vmax; + MODULES_VADDR = MODULES_END - MODULES_LEN; + VMALLOC_END = MODULES_VADDR; + VMALLOC_START = vmax - vmalloc_size; + + /* Split remaining virtual space between 1:1 mapping & vmemmap array */ + tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + tmp = SECTION_ALIGN_UP(tmp); + tmp = VMALLOC_START - tmp * sizeof(struct page); + tmp &= ~((vmax >> 11) - 1); /* align to page table level */ + tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS); + vmemmap = (struct page *) tmp; + + /* Take care that memory_end is set and <= vmemmap */ + memory_end = min(memory_end ?: max_physmem_end, tmp); + max_pfn = max_low_pfn = PFN_DOWN(memory_end); + memblock_remove(memory_end, ULONG_MAX); + + pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); +} + +#ifdef CONFIG_CRASH_DUMP + +/* + * When kdump is enabled, we have to ensure that no memory from + * the area [0 - crashkernel memory size] and + * [crashk_res.start - crashk_res.end] is set offline. + */ +static int kdump_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct memory_notify *arg = data; + + if (action != MEM_GOING_OFFLINE) + return NOTIFY_OK; + if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) + return NOTIFY_BAD; + if (arg->start_pfn > PFN_DOWN(crashk_res.end)) + return NOTIFY_OK; + if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start)) + return NOTIFY_OK; + return NOTIFY_BAD; +} + +static struct notifier_block kdump_mem_nb = { + .notifier_call = kdump_mem_notifier, +}; + +#endif + +/* + * Make sure that the area behind memory_end is protected + */ +static void __init reserve_memory_end(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (ipl_info.type == IPL_TYPE_FCP_DUMP && + !OLDMEM_BASE && sclp.hsa_size) { + memory_end = sclp.hsa_size; + memory_end &= PAGE_MASK; + memory_end_set = 1; + } +#endif + if (!memory_end_set) + return; + memblock_reserve(memory_end, ULONG_MAX); +} + +/* + * Make sure that oldmem, where the dump is stored, is protected + */ +static void __init reserve_oldmem(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (OLDMEM_BASE) + /* Forget all memory above the running kdump system */ + memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); +#endif +} + +/* + * Make sure that oldmem, where the dump is stored, is protected + */ +static void __init remove_oldmem(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (OLDMEM_BASE) + /* Forget all memory above the running kdump system */ + memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX); +#endif +} + +/* + * Reserve memory for kdump kernel to be loaded with kexec + */ +static void __init reserve_crashkernel(void) +{ +#ifdef CONFIG_CRASH_DUMP + unsigned long long crash_base, crash_size; + phys_addr_t low, high; + int rc; + + rc = parse_crashkernel(boot_command_line, memory_end, &crash_size, + &crash_base); + + crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); + crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); + if (rc || crash_size == 0) + return; + + if (memblock.memory.regions[0].size < crash_size) { + pr_info("crashkernel reservation failed: %s\n", + "first memory chunk must be at least crashkernel size"); + return; + } + + low = crash_base ?: OLDMEM_BASE; + high = low + crash_size; + if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) { + /* The crashkernel fits into OLDMEM, reuse OLDMEM */ + crash_base = low; + } else { + /* Find suitable area in free memory */ + low = max_t(unsigned long, crash_size, sclp.hsa_size); + high = crash_base ? crash_base + crash_size : ULONG_MAX; + + if (crash_base && crash_base < low) { + pr_info("crashkernel reservation failed: %s\n", + "crash_base too low"); + return; + } + low = crash_base ?: low; + crash_base = memblock_find_in_range(low, high, crash_size, + KEXEC_CRASH_MEM_ALIGN); + } + + if (!crash_base) { + pr_info("crashkernel reservation failed: %s\n", + "no suitable area found"); + return; + } + + if (register_memory_notifier(&kdump_mem_nb)) + return; + + if (!OLDMEM_BASE && MACHINE_IS_VM) + diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + memblock_remove(crash_base, crash_size); + pr_info("Reserving %lluMB of memory at %lluMB " + "for crashkernel (System RAM: %luMB)\n", + crash_size >> 20, crash_base >> 20, + (unsigned long)memblock.memory.total_size >> 20); + os_info_crashkernel_add(crash_base, crash_size); +#endif +} + +/* + * Reserve the initrd from being used by memblock + */ +static void __init reserve_initrd(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + if (!INITRD_START || !INITRD_SIZE) + return; + initrd_start = INITRD_START; + initrd_end = initrd_start + INITRD_SIZE; + memblock_reserve(INITRD_START, INITRD_SIZE); +#endif +} + +/* + * Check for initrd being in usable memory + */ +static void __init check_initrd(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + if (INITRD_START && INITRD_SIZE && + !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) { + pr_err("The initial RAM disk does not fit into the memory\n"); + memblock_free(INITRD_START, INITRD_SIZE); + initrd_start = initrd_end = 0; + } +#endif +} + +/* + * Reserve memory used for lowcore/command line/kernel image. + */ +static void __init reserve_kernel(void) +{ + unsigned long start_pfn = PFN_UP(__pa(_end)); + +#ifdef CONFIG_DMA_API_DEBUG + /* + * DMA_API_DEBUG code stumbles over addresses from the + * range [PARMAREA_END, _stext]. Mark the memory as reserved + * so it is not used for CONFIG_DMA_API_DEBUG=y. + */ + memblock_reserve(0, PFN_PHYS(start_pfn)); +#else + memblock_reserve(0, PARMAREA_END); + memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) + - (unsigned long)_stext); +#endif +} + +static void __init setup_memory(void) +{ + struct memblock_region *reg; + + /* + * Init storage key for present memory + */ + for_each_memblock(memory, reg) { + storage_key_init_range(reg->base, reg->base + reg->size); + } + psw_set_key(PAGE_DEFAULT_KEY); +} + +/* + * Setup hardware capabilities. + */ +static int __init setup_hwcaps(void) +{ + static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 }; + struct cpuid cpu_id; + int i; + + /* + * The store facility list bits numbers as found in the principles + * of operation are numbered with bit 1UL<<31 as number 0 to + * bit 1UL<<0 as number 31. + * Bit 0: instructions named N3, "backported" to esa-mode + * Bit 2: z/Architecture mode is active + * Bit 7: the store-facility-list-extended facility is installed + * Bit 17: the message-security assist is installed + * Bit 19: the long-displacement facility is installed + * Bit 21: the extended-immediate facility is installed + * Bit 22: extended-translation facility 3 is installed + * Bit 30: extended-translation facility 3 enhancement facility + * These get translated to: + * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1, + * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3, + * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and + * HWCAP_S390_ETF3EH bit 8 (22 && 30). + */ + for (i = 0; i < 6; i++) + if (test_facility(stfl_bits[i])) + elf_hwcap |= 1UL << i; + + if (test_facility(22) && test_facility(30)) + elf_hwcap |= HWCAP_S390_ETF3EH; + + /* + * Check for additional facilities with store-facility-list-extended. + * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0 + * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information + * as stored by stfl, bits 32-xxx contain additional facilities. + * How many facility words are stored depends on the number of + * doublewords passed to the instruction. The additional facilities + * are: + * Bit 42: decimal floating point facility is installed + * Bit 44: perform floating point operation facility is installed + * translated to: + * HWCAP_S390_DFP bit 6 (42 && 44). + */ + if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44)) + elf_hwcap |= HWCAP_S390_DFP; + + /* + * Huge page support HWCAP_S390_HPAGE is bit 7. + */ + if (MACHINE_HAS_EDAT1) + elf_hwcap |= HWCAP_S390_HPAGE; + + /* + * 64-bit register support for 31-bit processes + * HWCAP_S390_HIGH_GPRS is bit 9. + */ + elf_hwcap |= HWCAP_S390_HIGH_GPRS; + + /* + * Transactional execution support HWCAP_S390_TE is bit 10. + */ + if (MACHINE_HAS_TE) + elf_hwcap |= HWCAP_S390_TE; + + /* + * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension + * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX + * instead of facility bit 129. + */ + if (MACHINE_HAS_VX) { + elf_hwcap |= HWCAP_S390_VXRS; + if (test_facility(134)) + elf_hwcap |= HWCAP_S390_VXRS_EXT; + if (test_facility(135)) + elf_hwcap |= HWCAP_S390_VXRS_BCD; + } + + /* + * Guarded storage support HWCAP_S390_GS is bit 12. + */ + if (MACHINE_HAS_GS) + elf_hwcap |= HWCAP_S390_GS; + + get_cpu_id(&cpu_id); + add_device_randomness(&cpu_id, sizeof(cpu_id)); + switch (cpu_id.machine) { + case 0x2064: + case 0x2066: + default: /* Use "z900" as default for 64 bit kernels. */ + strcpy(elf_platform, "z900"); + break; + case 0x2084: + case 0x2086: + strcpy(elf_platform, "z990"); + break; + case 0x2094: + case 0x2096: + strcpy(elf_platform, "z9-109"); + break; + case 0x2097: + case 0x2098: + strcpy(elf_platform, "z10"); + break; + case 0x2817: + case 0x2818: + strcpy(elf_platform, "z196"); + break; + case 0x2827: + case 0x2828: + strcpy(elf_platform, "zEC12"); + break; + case 0x2964: + case 0x2965: + strcpy(elf_platform, "z13"); + break; + case 0x3906: + case 0x3907: + strcpy(elf_platform, "z14"); + break; + } + + /* + * Virtualization support HWCAP_INT_SIE is bit 0. + */ + if (sclp.has_sief2) + int_hwcap |= HWCAP_INT_SIE; + + return 0; +} +arch_initcall(setup_hwcaps); + +/* + * Add system information as device randomness + */ +static void __init setup_randomness(void) +{ + struct sysinfo_3_2_2 *vmms; + + vmms = (struct sysinfo_3_2_2 *) memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) + add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); + memblock_free((unsigned long) vmms, PAGE_SIZE); +} + +/* + * Find the correct size for the task_struct. This depends on + * the size of the struct fpu at the end of the thread_struct + * which is embedded in the task_struct. + */ +static void __init setup_task_size(void) +{ + int task_size = sizeof(struct task_struct); + + if (!MACHINE_HAS_VX) { + task_size -= sizeof(__vector128) * __NUM_VXRS; + task_size += sizeof(freg_t) * __NUM_FPRS; + } + arch_task_struct_size = task_size; +} + +/* + * Setup function called from init/main.c just after the banner + * was printed. + */ + +void __init setup_arch(char **cmdline_p) +{ + /* + * print what head.S has found out about the machine + */ + if (MACHINE_IS_VM) + pr_info("Linux is running as a z/VM " + "guest operating system in 64-bit mode\n"); + else if (MACHINE_IS_KVM) + pr_info("Linux is running under KVM in 64-bit mode\n"); + else if (MACHINE_IS_LPAR) + pr_info("Linux is running natively in 64-bit mode\n"); + else + pr_info("Linux is running as a guest in 64-bit mode\n"); + + /* Have one command line that is parsed and saved in /proc/cmdline */ + /* boot_command_line has been already set up in early.c */ + *cmdline_p = boot_command_line; + + ROOT_DEV = Root_RAM0; + + /* Is init_mm really needed? */ + init_mm.start_code = PAGE_OFFSET; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = (unsigned long) _end; + + if (IS_ENABLED(CONFIG_EXPOLINE_AUTO)) + nospec_auto_detect(); + + parse_early_param(); +#ifdef CONFIG_CRASH_DUMP + /* Deactivate elfcorehdr= kernel parameter */ + elfcorehdr_addr = ELFCORE_ADDR_MAX; +#endif + + os_info_init(); + setup_ipl(); + setup_task_size(); + + /* Do some memory reservations *before* memory is added to memblock */ + reserve_memory_end(); + reserve_oldmem(); + reserve_kernel(); + reserve_initrd(); + memblock_allow_resize(); + + /* Get information about *all* installed memory */ + detect_memory_memblock(); + + remove_oldmem(); + + /* + * Make sure all chunks are MAX_ORDER aligned so we don't need the + * extra checks that HOLES_IN_ZONE would require. + * + * Is this still required? + */ + memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); + + setup_memory_end(); + setup_memory(); + dma_contiguous_reserve(memory_end); + vmcp_cma_reserve(); + + check_initrd(); + reserve_crashkernel(); +#ifdef CONFIG_CRASH_DUMP + /* + * Be aware that smp_save_dump_cpus() triggers a system reset. + * Therefore CPU and device initialization should be done afterwards. + */ + smp_save_dump_cpus(); +#endif + + setup_resources(); + setup_lowcore_dat_off(); + smp_fill_possible_mask(); + cpu_detect_mhz_feature(); + cpu_init(); + numa_setup(); + smp_detect_cpus(); + topology_init_early(); + + /* + * Create kernel page tables and switch to virtual addressing. + */ + paging_init(); + + /* + * After paging_init created the kernel page table, the new PSWs + * in lowcore can now run with DAT enabled. + */ + setup_lowcore_dat_on(); + + /* Setup default console */ + conmode_default(); + set_preferred_console(); + + apply_alternative_instructions(); + if (IS_ENABLED(CONFIG_EXPOLINE)) + nospec_init_branches(); + + /* Setup zfcpdump support */ + setup_zfcpdump(); + + /* Add system specific data to the random pool */ + setup_randomness(); +} diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c new file mode 100644 index 000000000..22f08245a --- /dev/null +++ b/arch/s390/kernel/signal.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2006 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * + * Based on Intel version + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + */ + +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/ptrace.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/tty.h> +#include <linux/personality.h> +#include <linux/binfmts.h> +#include <linux/tracehook.h> +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <asm/ucontext.h> +#include <linux/uaccess.h> +#include <asm/lowcore.h> +#include <asm/switch_to.h> +#include "entry.h" + +/* + * Layout of an old-style signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | struct sigcontext | + * | oldmask | + * | _sigregs * | + * ----------------------------------------- + * | _sigregs with | + * | _s390_regs_common | + * | _s390_fp_regs | + * ----------------------------------------- + * | int signo | + * ----------------------------------------- + * | _sigregs_ext with | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt) | + * | reserved 128 byte (opt) | + * ----------------------------------------- + * | __u16 svc_insn | + * ----------------------------------------- + * The svc_insn entry with the sigreturn system call opcode does not + * have a fixed position and moves if gprs_high or vxrs exist. + * Future extensions will be added to _sigregs_ext. + */ +struct sigframe +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; + struct sigcontext sc; + _sigregs sregs; + int signo; + _sigregs_ext sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ +}; + +/* + * Layout of an rt signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | svc __NR_rt_sigreturn 2 byte | + * ----------------------------------------- + * | struct siginfo | + * ----------------------------------------- + * | struct ucontext_extended with | + * | unsigned long uc_flags | + * | struct ucontext *uc_link | + * | stack_t uc_stack | + * | _sigregs uc_mcontext with | + * | _s390_regs_common | + * | _s390_fp_regs | + * | sigset_t uc_sigmask | + * | _sigregs_ext uc_mcontext_ext | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt)| + * | reserved 128 byte (opt) | + * ----------------------------------------- + * Future extensions will be added to _sigregs_ext. + */ +struct rt_sigframe +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; + __u16 svc_insn; + struct siginfo info; + struct ucontext_extended uc; +}; + +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + save_access_regs(current->thread.acrs); + save_fpu_regs(); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + restore_access_regs(current->thread.acrs); +} + +/* Returns non-zero on fault. */ +static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) +{ + _sigregs user_sregs; + + /* Copy a 'clean' PSW mask to the user to avoid leaking + information about whether PER is currently on. */ + user_sregs.regs.psw.mask = PSW_USER_BITS | + (regs->psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); + user_sregs.regs.psw.addr = regs->psw.addr; + memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + fpregs_store(&user_sregs.fpregs, ¤t->thread.fpu); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) + return -EFAULT; + return 0; +} + +static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) +{ + _sigregs user_sregs; + + /* Alwys make any pending restarted system call return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) + return -EFAULT; + + if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) + return -EINVAL; + + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ + regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | + (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); + /* Check for invalid user address space control. */ + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) + regs->psw.mask = PSW_ASC_PRIMARY | + (regs->psw.mask & ~PSW_MASK_ASC); + /* Check for invalid amode */ + if (regs->psw.mask & PSW_MASK_EA) + regs->psw.mask |= PSW_MASK_BA; + regs->psw.addr = user_sregs.regs.psw.addr; + memcpy(®s->gprs, &user_sregs.regs.gprs, sizeof(sregs->regs.gprs)); + memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, + sizeof(current->thread.acrs)); + + fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); + + clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + return 0; +} + +/* Returns non-zero on fault. */ +static int save_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Save vector registers to signal stack */ + if (MACHINE_HAS_VX) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } + return 0; +} + +static int restore_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Restore vector registers from signal stack */ + if (MACHINE_HAS_VX) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + } + return 0; +} + +SYSCALL_DEFINE0(sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + struct sigframe __user *frame = + (struct sigframe __user *) regs->gprs[15]; + sigset_t set; + + if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) + goto badframe; + set_current_blocked(&set); + save_fpu_regs(); + if (restore_sigregs(regs, &frame->sregs)) + goto badframe; + if (restore_sigregs_ext(regs, &frame->sregs_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)regs->gprs[15]; + sigset_t set; + + if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + set_current_blocked(&set); + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + save_fpu_regs(); + if (restore_sigregs(regs, &frame->uc.uc_mcontext)) + goto badframe; + if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = regs->gprs[15]; + + /* Overflow on alternate signal stack gives SIGSEGV. */ + if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) + return (void __user *) -1UL; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (! sas_ss_flags(sp)) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)((sp - frame_size) & -8ul); +} + +static int setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) +{ + struct sigframe __user *frame; + struct sigcontext sc; + unsigned long restorer; + size_t frame_size; + + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); + if (MACHINE_HAS_VX) + frame_size += sizeof(frame->sregs_ext); + frame = get_sigframe(ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) + return -EFAULT; + + /* Create struct sigcontext on the signal stack */ + memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE); + sc.sregs = (_sigregs __user __force *) &frame->sregs; + if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc))) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs on the signal stack */ + if (save_sigregs(regs, &frame->sregs)) + return -EFAULT; + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __user *) &frame->signo)) + return -EFAULT; + + /* Create _sigregs_ext on the signal stack */ + if (save_sigregs_ext(regs, &frame->sregs_ext)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long) ka->sa.sa_restorer; + } else { + /* Signal frame without vector registers are short ! */ + __u16 __user *svc = (void __user *) frame + frame_size - 2; + if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) + return -EFAULT; + restorer = (unsigned long) svc; + } + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (unsigned long) frame; + /* Force default amode and default user address space control. */ + regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (unsigned long) ka->sa.sa_handler; + + regs->gprs[2] = sig; + regs->gprs[3] = (unsigned long) &frame->sc; + + /* We forgot to include these in the sigcontext. + To avoid breaking binary compatibility, they are passed as args. */ + if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP || sig == SIGFPE) { + /* set extra registers only for synchronous signals */ + regs->gprs[4] = regs->int_code & 127; + regs->gprs[5] = regs->int_parm_long; + regs->gprs[6] = current->thread.last_break; + } + return 0; +} + +static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long uc_flags, restorer; + size_t frame_size; + + frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + uc_flags = 0; + if (MACHINE_HAS_VX) { + frame_size += sizeof(_sigregs_ext); + uc_flags |= UC_VXRS; + } + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long) ksig->ka.sa.sa_restorer; + } else { + __u16 __user *svc = &frame->svc_insn; + if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) + return -EFAULT; + restorer = (unsigned long) svc; + } + + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(NULL, &frame->uc.uc_link) || + __save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs(regs, &frame->uc.uc_mcontext) || + __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) || + save_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (unsigned long) frame; + /* Force default amode and default user address space control. */ + regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (unsigned long) ksig->ka.sa.sa_handler; + + regs->gprs[2] = ksig->sig; + regs->gprs[3] = (unsigned long) &frame->info; + regs->gprs[4] = (unsigned long) &frame->uc; + regs->gprs[5] = current->thread.last_break; + return 0; +} + +static void handle_signal(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + ret = setup_rt_frame(ksig, oldset, regs); + else + ret = setup_frame(ksig->sig, &ksig->ka, oldset, regs); + + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + * + * Note that we go through the signals twice: once to check the signals that + * the kernel can handle, and then we build all the user-level signal handling + * stack-frames in one go after that. + */ +void do_signal(struct pt_regs *regs) +{ + struct ksignal ksig; + sigset_t *oldset = sigmask_to_save(); + + /* + * Get signal to deliver. When running under ptrace, at this point + * the debugger may change all our registers, including the system + * call information. + */ + current->thread.system_call = + test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; + + if (get_signal(&ksig)) { + /* Whee! Actually deliver the signal. */ + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; + /* Check for system call restarting. */ + switch (regs->gprs[2]) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->gprs[2] = -EINTR; + break; + case -ERESTARTSYS: + if (!(ksig.ka.sa.sa_flags & SA_RESTART)) { + regs->gprs[2] = -EINTR; + break; + } + /* fallthrough */ + case -ERESTARTNOINTR: + regs->gprs[2] = regs->orig_gpr2; + regs->psw.addr = + __rewind_psw(regs->psw, + regs->int_code >> 16); + break; + } + } + /* No longer in a system call */ + clear_pt_regs_flag(regs, PIF_SYSCALL); + rseq_signal_deliver(&ksig, regs); + if (is_compat_task()) + handle_signal32(&ksig, oldset, regs); + else + handle_signal(&ksig, oldset, regs); + return; + } + + /* No handlers present - check for system call restart */ + clear_pt_regs_flag(regs, PIF_SYSCALL); + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; + switch (regs->gprs[2]) { + case -ERESTART_RESTARTBLOCK: + /* Restart with sys_restart_syscall */ + regs->int_code = __NR_restart_syscall; + /* fallthrough */ + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + /* Restart system call with magic TIF bit. */ + regs->gprs[2] = regs->orig_gpr2; + set_pt_regs_flag(regs, PIF_SYSCALL); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_pt_regs_flag(regs, PIF_PER_TRAP); + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask back. + */ + restore_saved_sigmask(); +} + +void do_notify_resume(struct pt_regs *regs) +{ + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); +} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c new file mode 100644 index 000000000..bce678c71 --- /dev/null +++ b/arch/s390/kernel/smp.c @@ -0,0 +1,1212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SMP related functions + * + * Copyright IBM Corp. 1999, 2012 + * Author(s): Denis Joseph Barrow, + * Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Heiko Carstens <heiko.carstens@de.ibm.com>, + * + * based on other smp stuff by + * (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net> + * (c) 1998 Ingo Molnar + * + * The code outside of smp.c uses logical cpu numbers, only smp.c does + * the translation of logical to physical cpu ids. All new code that + * operates on physical cpu numbers needs to go into smp.c. + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/workqueue.h> +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/kernel_stat.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irqflags.h> +#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/task_stack.h> +#include <linux/crash_dump.h> +#include <linux/memblock.h> +#include <linux/kprobes.h> +#include <asm/asm-offsets.h> +#include <asm/diag.h> +#include <asm/switch_to.h> +#include <asm/facility.h> +#include <asm/ipl.h> +#include <asm/setup.h> +#include <asm/irq.h> +#include <asm/tlbflush.h> +#include <asm/vtimer.h> +#include <asm/lowcore.h> +#include <asm/sclp.h> +#include <asm/vdso.h> +#include <asm/debug.h> +#include <asm/os_info.h> +#include <asm/sigp.h> +#include <asm/idle.h> +#include <asm/nmi.h> +#include <asm/topology.h> +#include "entry.h" + +enum { + ec_schedule = 0, + ec_call_function_single, + ec_stop_cpu, +}; + +enum { + CPU_STATE_STANDBY, + CPU_STATE_CONFIGURED, +}; + +static DEFINE_PER_CPU(struct cpu *, cpu_device); + +struct pcpu { + struct lowcore *lowcore; /* lowcore page(s) for the cpu */ + unsigned long ec_mask; /* bit mask for ec_xxx functions */ + unsigned long ec_clk; /* sigp timestamp for ec_xxx */ + signed char state; /* physical cpu state */ + signed char polarization; /* physical polarization */ + u16 address; /* physical cpu address */ +}; + +static u8 boot_core_type; +static struct pcpu pcpu_devices[NR_CPUS]; + +unsigned int smp_cpu_mt_shift; +EXPORT_SYMBOL(smp_cpu_mt_shift); + +unsigned int smp_cpu_mtid; +EXPORT_SYMBOL(smp_cpu_mtid); + +#ifdef CONFIG_CRASH_DUMP +__vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +#endif + +static unsigned int smp_max_threads __initdata = -1U; + +static int __init early_nosmt(char *s) +{ + smp_max_threads = 1; + return 0; +} +early_param("nosmt", early_nosmt); + +static int __init early_smt(char *s) +{ + get_option(&s, &smp_max_threads); + return 0; +} +early_param("smt", early_smt); + +/* + * The smp_cpu_state_mutex must be held when changing the state or polarization + * member of a pcpu data structure within the pcpu_devices arreay. + */ +DEFINE_MUTEX(smp_cpu_state_mutex); + +/* + * Signal processor helper functions. + */ +static inline int __pcpu_sigp_relax(u16 addr, u8 order, unsigned long parm) +{ + int cc; + + while (1) { + cc = __pcpu_sigp(addr, order, parm, NULL); + if (cc != SIGP_CC_BUSY) + return cc; + cpu_relax(); + } +} + +static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm) +{ + int cc, retry; + + for (retry = 0; ; retry++) { + cc = __pcpu_sigp(pcpu->address, order, parm, NULL); + if (cc != SIGP_CC_BUSY) + break; + if (retry >= 3) + udelay(10); + } + return cc; +} + +static inline int pcpu_stopped(struct pcpu *pcpu) +{ + u32 uninitialized_var(status); + + if (__pcpu_sigp(pcpu->address, SIGP_SENSE, + 0, &status) != SIGP_CC_STATUS_STORED) + return 0; + return !!(status & (SIGP_STATUS_CHECK_STOP|SIGP_STATUS_STOPPED)); +} + +static inline int pcpu_running(struct pcpu *pcpu) +{ + if (__pcpu_sigp(pcpu->address, SIGP_SENSE_RUNNING, + 0, NULL) != SIGP_CC_STATUS_STORED) + return 1; + /* Status stored condition code is equivalent to cpu not running. */ + return 0; +} + +/* + * Find struct pcpu by cpu address. + */ +static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) +{ + int cpu; + + for_each_cpu(cpu, mask) + if (pcpu_devices[cpu].address == address) + return pcpu_devices + cpu; + return NULL; +} + +static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) +{ + int order; + + if (test_and_set_bit(ec_bit, &pcpu->ec_mask)) + return; + order = pcpu_running(pcpu) ? SIGP_EXTERNAL_CALL : SIGP_EMERGENCY_SIGNAL; + pcpu->ec_clk = get_tod_clock_fast(); + pcpu_sigp_retry(pcpu, order, 0); +} + +#define ASYNC_FRAME_OFFSET (ASYNC_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) +#define PANIC_FRAME_OFFSET (PAGE_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) + +static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) +{ + unsigned long async_stack, panic_stack; + struct lowcore *lc; + + if (pcpu != &pcpu_devices[0]) { + pcpu->lowcore = (struct lowcore *) + __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); + async_stack = __get_free_pages(GFP_KERNEL, ASYNC_ORDER); + panic_stack = __get_free_page(GFP_KERNEL); + if (!pcpu->lowcore || !panic_stack || !async_stack) + goto out; + } else { + async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET; + panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET; + } + lc = pcpu->lowcore; + memcpy(lc, &S390_lowcore, 512); + memset((char *) lc + 512, 0, sizeof(*lc) - 512); + lc->async_stack = async_stack + ASYNC_FRAME_OFFSET; + lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET; + lc->cpu_nr = cpu; + lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; + lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + if (nmi_alloc_per_cpu(lc)) + goto out; + if (vdso_alloc_per_cpu(lc)) + goto out_mcesa; + lowcore_ptr[cpu] = lc; + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); + return 0; + +out_mcesa: + nmi_free_per_cpu(lc); +out: + if (pcpu != &pcpu_devices[0]) { + free_page(panic_stack); + free_pages(async_stack, ASYNC_ORDER); + free_pages((unsigned long) pcpu->lowcore, LC_ORDER); + } + return -ENOMEM; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void pcpu_free_lowcore(struct pcpu *pcpu) +{ + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); + lowcore_ptr[pcpu - pcpu_devices] = NULL; + vdso_free_per_cpu(pcpu->lowcore); + nmi_free_per_cpu(pcpu->lowcore); + if (pcpu == &pcpu_devices[0]) + return; + free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET); + free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER); + free_pages((unsigned long) pcpu->lowcore, LC_ORDER); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) +{ + struct lowcore *lc = pcpu->lowcore; + + cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); + lc->cpu_nr = cpu; + lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; + lc->percpu_offset = __per_cpu_offset[cpu]; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.kernel_asce; + lc->machine_flags = S390_lowcore.machine_flags; + lc->user_timer = lc->system_timer = lc->steal_timer = 0; + __ctl_store(lc->cregs_save_area, 0, 15); + lc->cregs_save_area[1] = lc->kernel_asce; + lc->cregs_save_area[7] = lc->vdso_asce; + save_access_regs((unsigned int *) lc->access_regs_save_area); + memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, + sizeof(lc->stfle_fac_list)); + memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, + sizeof(lc->alt_stfle_fac_list)); + arch_spin_lock_setup(cpu); +} + +static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +{ + struct lowcore *lc = pcpu->lowcore; + + lc->kernel_stack = (unsigned long) task_stack_page(tsk) + + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); + lc->current_task = (unsigned long) tsk; + lc->lpp = LPP_MAGIC; + lc->current_pid = tsk->pid; + lc->user_timer = tsk->thread.user_timer; + lc->guest_timer = tsk->thread.guest_timer; + lc->system_timer = tsk->thread.system_timer; + lc->hardirq_timer = tsk->thread.hardirq_timer; + lc->softirq_timer = tsk->thread.softirq_timer; + lc->steal_timer = 0; +} + +static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +{ + struct lowcore *lc = pcpu->lowcore; + + lc->restart_stack = lc->kernel_stack; + lc->restart_fn = (unsigned long) func; + lc->restart_data = (unsigned long) data; + lc->restart_source = -1UL; + pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); +} + +/* + * Call function via PSW restart on pcpu and stop the current cpu. + */ +static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *), + void *data, unsigned long stack) +{ + struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; + unsigned long source_cpu = stap(); + + __load_psw_mask(PSW_KERNEL_BITS); + if (pcpu->address == source_cpu) + func(data); /* should not return */ + /* Stop target cpu (if func returns this stops the current cpu). */ + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + /* Restart func on the target cpu and stop the current cpu. */ + mem_assign_absolute(lc->restart_stack, stack); + mem_assign_absolute(lc->restart_fn, (unsigned long) func); + mem_assign_absolute(lc->restart_data, (unsigned long) data); + mem_assign_absolute(lc->restart_source, source_cpu); + __bpon(); + asm volatile( + "0: sigp 0,%0,%2 # sigp restart to target cpu\n" + " brc 2,0b # busy, try again\n" + "1: sigp 0,%1,%3 # sigp stop to current cpu\n" + " brc 2,1b # busy, try again\n" + : : "d" (pcpu->address), "d" (source_cpu), + "K" (SIGP_RESTART), "K" (SIGP_STOP) + : "0", "1", "cc"); + for (;;) ; +} + +/* + * Enable additional logical cpus for multi-threading. + */ +static int pcpu_set_smt(unsigned int mtid) +{ + int cc; + + if (smp_cpu_mtid == mtid) + return 0; + cc = __pcpu_sigp(0, SIGP_SET_MULTI_THREADING, mtid, NULL); + if (cc == 0) { + smp_cpu_mtid = mtid; + smp_cpu_mt_shift = 0; + while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) + smp_cpu_mt_shift++; + pcpu_devices[0].address = stap(); + } + return cc; +} + +/* + * Call function on an online CPU. + */ +void smp_call_online_cpu(void (*func)(void *), void *data) +{ + struct pcpu *pcpu; + + /* Use the current cpu if it is online. */ + pcpu = pcpu_find_address(cpu_online_mask, stap()); + if (!pcpu) + /* Use the first online cpu. */ + pcpu = pcpu_devices + cpumask_first(cpu_online_mask); + pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); +} + +/* + * Call function on the ipl CPU. + */ +void smp_call_ipl_cpu(void (*func)(void *), void *data) +{ + struct lowcore *lc = pcpu_devices->lowcore; + + if (pcpu_devices[0].address == stap()) + lc = &S390_lowcore; + + pcpu_delegate(&pcpu_devices[0], func, data, + lc->panic_stack - PANIC_FRAME_OFFSET + PAGE_SIZE); +} + +int smp_find_processor_id(u16 address) +{ + int cpu; + + for_each_present_cpu(cpu) + if (pcpu_devices[cpu].address == address) + return cpu; + return -1; +} + +bool notrace arch_vcpu_is_preempted(int cpu) +{ + if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) + return false; + if (pcpu_running(pcpu_devices + cpu)) + return false; + return true; +} +EXPORT_SYMBOL(arch_vcpu_is_preempted); + +void notrace smp_yield_cpu(int cpu) +{ + if (MACHINE_HAS_DIAG9C) { + diag_stat_inc_norecursion(DIAG_STAT_X09C); + asm volatile("diag %0,0,0x9c" + : : "d" (pcpu_devices[cpu].address)); + } else if (MACHINE_HAS_DIAG44) { + diag_stat_inc_norecursion(DIAG_STAT_X044); + asm volatile("diag 0,0,0x44"); + } +} + +/* + * Send cpus emergency shutdown signal. This gives the cpus the + * opportunity to complete outstanding interrupts. + */ +void notrace smp_emergency_stop(void) +{ + cpumask_t cpumask; + u64 end; + int cpu; + + cpumask_copy(&cpumask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpumask); + + end = get_tod_clock() + (1000000UL << 12); + for_each_cpu(cpu, &cpumask) { + struct pcpu *pcpu = pcpu_devices + cpu; + set_bit(ec_stop_cpu, &pcpu->ec_mask); + while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, + 0, NULL) == SIGP_CC_BUSY && + get_tod_clock() < end) + cpu_relax(); + } + while (get_tod_clock() < end) { + for_each_cpu(cpu, &cpumask) + if (pcpu_stopped(pcpu_devices + cpu)) + cpumask_clear_cpu(cpu, &cpumask); + if (cpumask_empty(&cpumask)) + break; + cpu_relax(); + } +} +NOKPROBE_SYMBOL(smp_emergency_stop); + +/* + * Stop all cpus but the current one. + */ +void smp_send_stop(void) +{ + int cpu; + + /* Disable all interrupts/machine checks */ + __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + trace_hardirqs_off(); + + debug_set_critical(); + + if (oops_in_progress) + smp_emergency_stop(); + + /* stop all processors */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu_devices + cpu)) + cpu_relax(); + } +} + +/* + * This is the main routine where commands issued by other + * cpus are handled. + */ +static void smp_handle_ext_call(void) +{ + unsigned long bits; + + /* handle bit signal external calls */ + bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + if (test_bit(ec_stop_cpu, &bits)) + smp_stop_cpu(); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function_single, &bits)) + generic_smp_call_function_single_interrupt(); +} + +static void do_ext_call_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + inc_irq_stat(ext_code.code == 0x1202 ? IRQEXT_EXC : IRQEXT_EMS); + smp_handle_ext_call(); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) + pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); +} + +void arch_send_call_function_single_ipi(int cpu) +{ + pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); +} + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void smp_send_reschedule(int cpu) +{ + pcpu_ec_call(pcpu_devices + cpu, ec_schedule); +} + +/* + * parameter area for the set/clear control bit callbacks + */ +struct ec_creg_mask_parms { + unsigned long orval; + unsigned long andval; + int cr; +}; + +/* + * callback for setting/clearing control bits + */ +static void smp_ctl_bit_callback(void *info) +{ + struct ec_creg_mask_parms *pp = info; + unsigned long cregs[16]; + + __ctl_store(cregs, 0, 15); + cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; + __ctl_load(cregs, 0, 15); +} + +/* + * Set a bit in a control register of all cpus + */ +void smp_ctl_set_bit(int cr, int bit) +{ + struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr }; + + on_each_cpu(smp_ctl_bit_callback, &parms, 1); +} +EXPORT_SYMBOL(smp_ctl_set_bit); + +/* + * Clear a bit in a control register of all cpus + */ +void smp_ctl_clear_bit(int cr, int bit) +{ + struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr }; + + on_each_cpu(smp_ctl_bit_callback, &parms, 1); +} +EXPORT_SYMBOL(smp_ctl_clear_bit); + +#ifdef CONFIG_CRASH_DUMP + +int smp_store_status(int cpu) +{ + struct pcpu *pcpu = pcpu_devices + cpu; + unsigned long pa; + + pa = __pa(&pcpu->lowcore->floating_pt_save_area); + if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, + pa) != SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + return 0; + pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK); + if (MACHINE_HAS_GS) + pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK; + if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, + pa) != SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + return 0; +} + +/* + * Collect CPU state of the previous, crashed system. + * There are four cases: + * 1) standard zfcp dump + * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The boot CPU state is located in + * the absolute lowcore of the memory stored in the HSA. The zcore code + * will copy the boot CPU state from the HSA. + * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory) + * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The firmware or the boot-loader + * stored the registers of the boot CPU in the absolute lowcore in the + * memory of the old system. + * 3) kdump and the old kernel did not store the CPU state, + * or stand-alone kdump for DASD + * condition: OLDMEM_BASE != NULL && !is_kdump_kernel() + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The kexec code or the boot-loader + * stored the registers of the boot CPU in the memory of the old system. + * 4) kdump and the old kernel stored the CPU state + * condition: OLDMEM_BASE != NULL && is_kdump_kernel() + * This case does not exist for s390 anymore, setup_arch explicitly + * deactivates the elfcorehdr= kernel parameter + */ +static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, + bool is_boot_cpu, unsigned long page) +{ + __vector128 *vxrs = (__vector128 *) page; + + if (is_boot_cpu) + vxrs = boot_cpu_vector_save_area; + else + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page); + save_area_add_vxrs(sa, vxrs); +} + +static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, + bool is_boot_cpu, unsigned long page) +{ + void *regs = (void *) page; + + if (is_boot_cpu) + copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512); + else + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page); + save_area_add_regs(sa, regs); +} + +void __init smp_save_dump_cpus(void) +{ + int addr, boot_cpu_addr, max_cpu_addr; + struct save_area *sa; + unsigned long page; + bool is_boot_cpu; + + if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP)) + /* No previous system present, normal boot. */ + return; + /* Allocate a page as dumping area for the store status sigps */ + page = memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, 1UL << 31); + /* Set multi-threading state to the previous system. */ + pcpu_set_smt(sclp.mtid_prev); + boot_cpu_addr = stap(); + max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; + for (addr = 0; addr <= max_cpu_addr; addr++) { + if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == + SIGP_CC_NOT_OPERATIONAL) + continue; + is_boot_cpu = (addr == boot_cpu_addr); + /* Allocate save area */ + sa = save_area_alloc(is_boot_cpu); + if (!sa) + panic("could not allocate memory for save area\n"); + if (MACHINE_HAS_VX) + /* Get the vector registers */ + smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); + /* + * For a zfcp dump OLDMEM_BASE == NULL and the registers + * of the boot CPU are stored in the HSA. To retrieve + * these registers an SCLP request is required which is + * done by drivers/s390/char/zcore.c:init_cpu_info() + */ + if (!is_boot_cpu || OLDMEM_BASE) + /* Get the CPU registers */ + smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + } + memblock_free(page, PAGE_SIZE); + diag308_reset(); + pcpu_set_smt(0); +} +#endif /* CONFIG_CRASH_DUMP */ + +void smp_cpu_set_polarization(int cpu, int val) +{ + pcpu_devices[cpu].polarization = val; +} + +int smp_cpu_get_polarization(int cpu) +{ + return pcpu_devices[cpu].polarization; +} + +static void __ref smp_get_core_info(struct sclp_core_info *info, int early) +{ + static int use_sigp_detection; + int address; + + if (use_sigp_detection || sclp_get_core_info(info, early)) { + use_sigp_detection = 1; + for (address = 0; + address < (SCLP_MAX_CORES << smp_cpu_mt_shift); + address += (1U << smp_cpu_mt_shift)) { + if (__pcpu_sigp_relax(address, SIGP_SENSE, 0) == + SIGP_CC_NOT_OPERATIONAL) + continue; + info->core[info->configured].core_id = + address >> smp_cpu_mt_shift; + info->configured++; + } + info->combined = info->configured; + } +} + +static int smp_add_present_cpu(int cpu); + +static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, + bool configured, bool early) +{ + struct pcpu *pcpu; + int cpu, nr, i; + u16 address; + + nr = 0; + if (sclp.has_core_type && core->type != boot_core_type) + return nr; + cpu = cpumask_first(avail); + address = core->core_id << smp_cpu_mt_shift; + for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { + if (pcpu_find_address(cpu_present_mask, address + i)) + continue; + pcpu = pcpu_devices + cpu; + pcpu->address = address + i; + if (configured) + pcpu->state = CPU_STATE_CONFIGURED; + else + pcpu->state = CPU_STATE_STANDBY; + smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + set_cpu_present(cpu, true); + if (!early && smp_add_present_cpu(cpu) != 0) + set_cpu_present(cpu, false); + else + nr++; + cpumask_clear_cpu(cpu, avail); + cpu = cpumask_next(cpu, avail); + } + return nr; +} + +static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) +{ + struct sclp_core_entry *core; + static cpumask_t avail; + bool configured; + u16 core_id; + int nr, i; + + nr = 0; + cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); + /* + * Add IPL core first (which got logical CPU number 0) to make sure + * that all SMT threads get subsequent logical CPU numbers. + */ + if (early) { + core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + for (i = 0; i < info->configured; i++) { + core = &info->core[i]; + if (core->core_id == core_id) { + nr += smp_add_core(core, &avail, true, early); + break; + } + } + } + for (i = 0; i < info->combined; i++) { + configured = i < info->configured; + nr += smp_add_core(&info->core[i], &avail, configured, early); + } + return nr; +} + +void __init smp_detect_cpus(void) +{ + unsigned int cpu, mtid, c_cpus, s_cpus; + struct sclp_core_info *info; + u16 address; + + /* Get CPU information */ + info = memblock_virt_alloc(sizeof(*info), 8); + smp_get_core_info(info, 1); + /* Find boot CPU type */ + if (sclp.has_core_type) { + address = stap(); + for (cpu = 0; cpu < info->combined; cpu++) + if (info->core[cpu].core_id == address) { + /* The boot cpu dictates the cpu type. */ + boot_core_type = info->core[cpu].type; + break; + } + if (cpu >= info->combined) + panic("Could not find boot CPU type"); + } + + /* Set multi-threading state for the current system */ + mtid = boot_core_type ? sclp.mtid : sclp.mtid_cp; + mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1; + pcpu_set_smt(mtid); + + /* Print number of CPUs */ + c_cpus = s_cpus = 0; + for (cpu = 0; cpu < info->combined; cpu++) { + if (sclp.has_core_type && + info->core[cpu].type != boot_core_type) + continue; + if (cpu < info->configured) + c_cpus += smp_cpu_mtid + 1; + else + s_cpus += smp_cpu_mtid + 1; + } + pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); + + /* Add CPUs present at boot */ + get_online_cpus(); + __smp_rescan_cpus(info, true); + put_online_cpus(); + memblock_free_early((unsigned long)info, sizeof(*info)); +} + +/* + * Activate a secondary processor. + */ +static void smp_start_secondary(void *cpuvoid) +{ + int cpu = raw_smp_processor_id(); + + S390_lowcore.last_update_clock = get_tod_clock(); + S390_lowcore.restart_stack = (unsigned long) restart_stack; + S390_lowcore.restart_fn = (unsigned long) do_restart; + S390_lowcore.restart_data = 0; + S390_lowcore.restart_source = -1UL; + restore_access_regs(S390_lowcore.access_regs_save_area); + __ctl_load(S390_lowcore.cregs_save_area, 0, 15); + __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + set_cpu_flag(CIF_ASCE_PRIMARY); + set_cpu_flag(CIF_ASCE_SECONDARY); + cpu_init(); + rcu_cpu_starting(cpu); + preempt_disable(); + init_cpu_timer(); + vtime_init(); + pfault_init(); + notify_cpu_starting(cpu); + if (topology_cpu_dedicated(cpu)) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); + set_cpu_online(cpu, true); + inc_irq_stat(CPU_RST); + local_irq_enable(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +/* Upping and downing of CPUs */ +int __cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + struct pcpu *pcpu = pcpu_devices + cpu; + int rc; + + if (pcpu->state != CPU_STATE_CONFIGURED) + return -EIO; + if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) != + SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + + rc = pcpu_alloc_lowcore(pcpu, cpu); + if (rc) + return rc; + pcpu_prepare_secondary(pcpu, cpu); + pcpu_attach_task(pcpu, tidle); + pcpu_start_fn(pcpu, smp_start_secondary, NULL); + /* Wait until cpu puts itself in the online & active maps */ + while (!cpu_online(cpu)) + cpu_relax(); + return 0; +} + +static unsigned int setup_possible_cpus __initdata; + +static int __init _setup_possible_cpus(char *s) +{ + get_option(&s, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + +#ifdef CONFIG_HOTPLUG_CPU + +int __cpu_disable(void) +{ + unsigned long cregs[16]; + + /* Handle possible pending IPIs */ + smp_handle_ext_call(); + set_cpu_online(smp_processor_id(), false); + /* Disable pseudo page faults on this cpu. */ + pfault_fini(); + /* Disable interrupt sources via control register. */ + __ctl_store(cregs, 0, 15); + cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ + __ctl_load(cregs, 0, 15); + clear_cpu_flag(CIF_NOHZ_DELAY); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + struct pcpu *pcpu; + + /* Wait until target cpu is down */ + pcpu = pcpu_devices + cpu; + while (!pcpu_stopped(pcpu)) + cpu_relax(); + pcpu_free_lowcore(pcpu); + cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); + cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); +} + +void __noreturn cpu_die(void) +{ + idle_task_exit(); + __bpon(); + pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + for (;;) ; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init smp_fill_possible_mask(void) +{ + unsigned int possible, sclp_max, cpu; + + sclp_max = max(sclp.mtid, sclp.mtid_cp) + 1; + sclp_max = min(smp_max_threads, sclp_max); + sclp_max = (sclp.max_cores * sclp_max) ?: nr_cpu_ids; + possible = setup_possible_cpus ?: nr_cpu_ids; + possible = min(possible, sclp_max); + for (cpu = 0; cpu < possible && cpu < nr_cpu_ids; cpu++) + set_cpu_possible(cpu, true); +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + /* request the 0x1201 emergency signal external interrupt */ + if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) + panic("Couldn't request external interrupt 0x1201"); + /* request the 0x1202 external call external interrupt */ + if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) + panic("Couldn't request external interrupt 0x1202"); +} + +void __init smp_prepare_boot_cpu(void) +{ + struct pcpu *pcpu = pcpu_devices; + + WARN_ON(!cpu_present(0) || !cpu_online(0)); + pcpu->state = CPU_STATE_CONFIGURED; + pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix(); + S390_lowcore.percpu_offset = __per_cpu_offset[0]; + smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ +} + +void __init smp_setup_processor_id(void) +{ + pcpu_devices[0].address = stap(); + S390_lowcore.cpu_nr = 0; + S390_lowcore.spinlock_lockval = arch_spin_lockval(0); + S390_lowcore.spinlock_index = 0; +} + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. + * + * usually you want to run this on all CPUs ;) + */ +int setup_profiling_timer(unsigned int multiplier) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static ssize_t cpu_configure_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} + +static ssize_t cpu_configure_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu; + int cpu, val, rc, i; + char delim; + + if (sscanf(buf, "%d %c", &val, &delim) != 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + get_online_cpus(); + mutex_lock(&smp_cpu_state_mutex); + rc = -EBUSY; + /* disallow configuration changes of online cpus and cpu 0 */ + cpu = dev->id; + cpu = smp_get_base_cpu(cpu); + if (cpu == 0) + goto out; + for (i = 0; i <= smp_cpu_mtid; i++) + if (cpu_online(cpu + i)) + goto out; + pcpu = pcpu_devices + cpu; + rc = 0; + switch (val) { + case 0: + if (pcpu->state != CPU_STATE_CONFIGURED) + break; + rc = sclp_core_deconfigure(pcpu->address >> smp_cpu_mt_shift); + if (rc) + break; + for (i = 0; i <= smp_cpu_mtid; i++) { + if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) + continue; + pcpu[i].state = CPU_STATE_STANDBY; + smp_cpu_set_polarization(cpu + i, + POLARIZATION_UNKNOWN); + } + topology_expect_change(); + break; + case 1: + if (pcpu->state != CPU_STATE_STANDBY) + break; + rc = sclp_core_configure(pcpu->address >> smp_cpu_mt_shift); + if (rc) + break; + for (i = 0; i <= smp_cpu_mtid; i++) { + if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) + continue; + pcpu[i].state = CPU_STATE_CONFIGURED; + smp_cpu_set_polarization(cpu + i, + POLARIZATION_UNKNOWN); + } + topology_expect_change(); + break; + default: + break; + } +out: + mutex_unlock(&smp_cpu_state_mutex); + put_online_cpus(); + return rc ? rc : count; +} +static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); +#endif /* CONFIG_HOTPLUG_CPU */ + +static ssize_t show_cpu_address(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); +} +static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); + +static struct attribute *cpu_common_attrs[] = { +#ifdef CONFIG_HOTPLUG_CPU + &dev_attr_configure.attr, +#endif + &dev_attr_address.attr, + NULL, +}; + +static struct attribute_group cpu_common_attr_group = { + .attrs = cpu_common_attrs, +}; + +static struct attribute *cpu_online_attrs[] = { + &dev_attr_idle_count.attr, + &dev_attr_idle_time_us.attr, + NULL, +}; + +static struct attribute_group cpu_online_attr_group = { + .attrs = cpu_online_attrs, +}; + +static int smp_cpu_online(unsigned int cpu) +{ + struct device *s = &per_cpu(cpu_device, cpu)->dev; + + return sysfs_create_group(&s->kobj, &cpu_online_attr_group); +} +static int smp_cpu_pre_down(unsigned int cpu) +{ + struct device *s = &per_cpu(cpu_device, cpu)->dev; + + sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + return 0; +} + +static int smp_add_present_cpu(int cpu) +{ + struct device *s; + struct cpu *c; + int rc; + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + per_cpu(cpu_device, cpu) = c; + s = &c->dev; + c->hotpluggable = 1; + rc = register_cpu(c, cpu); + if (rc) + goto out; + rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + if (rc) + goto out_cpu; + rc = topology_cpu_init(c); + if (rc) + goto out_topology; + return 0; + +out_topology: + sysfs_remove_group(&s->kobj, &cpu_common_attr_group); +out_cpu: +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu(c); +#endif +out: + return rc; +} + +#ifdef CONFIG_HOTPLUG_CPU + +int __ref smp_rescan_cpus(void) +{ + struct sclp_core_info *info; + int nr; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + smp_get_core_info(info, 0); + get_online_cpus(); + mutex_lock(&smp_cpu_state_mutex); + nr = __smp_rescan_cpus(info, false); + mutex_unlock(&smp_cpu_state_mutex); + put_online_cpus(); + kfree(info); + if (nr) + topology_schedule_update(); + return 0; +} + +static ssize_t __ref rescan_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int rc; + + rc = lock_device_hotplug_sysfs(); + if (rc) + return rc; + rc = smp_rescan_cpus(); + unlock_device_hotplug(); + return rc ? rc : count; +} +static DEVICE_ATTR_WO(rescan); +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init s390_smp_init(void) +{ + int cpu, rc = 0; + +#ifdef CONFIG_HOTPLUG_CPU + rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); + if (rc) + return rc; +#endif + for_each_present_cpu(cpu) { + rc = smp_add_present_cpu(cpu); + if (rc) + goto out; + } + + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", + smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? rc : 0; +out: + return rc; +} +subsys_initcall(s390_smp_init); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c new file mode 100644 index 000000000..460dcfba7 --- /dev/null +++ b/arch/s390/kernel/stacktrace.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stack trace management functions + * + * Copyright IBM Corp. 2006 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/stacktrace.h> +#include <linux/kallsyms.h> +#include <linux/export.h> + +static int __save_address(void *data, unsigned long address, int nosched) +{ + struct stack_trace *trace = data; + + if (nosched && in_sched_functions(address)) + return 0; + if (trace->skip > 0) { + trace->skip--; + return 0; + } + if (trace->nr_entries < trace->max_entries) { + trace->entries[trace->nr_entries++] = address; + return 0; + } + return 1; +} + +static int save_address(void *data, unsigned long address, int reliable) +{ + return __save_address(data, address, 0); +} + +static int save_address_nosched(void *data, unsigned long address, int reliable) +{ + return __save_address(data, address, 1); +} + +void save_stack_trace(struct stack_trace *trace) +{ + unsigned long sp; + + sp = current_stack_pointer(); + dump_trace(save_address, trace, NULL, sp); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + unsigned long sp; + + sp = tsk->thread.ksp; + if (tsk == current) + sp = current_stack_pointer(); + dump_trace(save_address_nosched, trace, tsk, sp); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + unsigned long sp; + + sp = kernel_stack_pointer(regs); + dump_trace(save_address, trace, NULL, sp); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c new file mode 100644 index 000000000..888cc2f16 --- /dev/null +++ b/arch/s390/kernel/sthyi.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * store hypervisor information instruction emulation functions. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com> + */ +#include <linux/errno.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/syscalls.h> +#include <linux/mutex.h> +#include <asm/asm-offsets.h> +#include <asm/sclp.h> +#include <asm/diag.h> +#include <asm/sysinfo.h> +#include <asm/ebcdic.h> +#include <asm/facility.h> +#include <asm/sthyi.h> +#include "entry.h" + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +/* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * cache the retrieved data whose valid period is 1s. + */ +#define CACHE_VALID_JIFFIES HZ + +struct sthyi_info { + void *info; + unsigned long end; +}; + +static DEFINE_MUTEX(sthyi_mutex); +static struct sthyi_info sthyi_cache; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + if (*(u64 *)sctns->mac.infmname != 0) + sctns->mac.infmval1 |= MAC_NAME_VLD; + + if (stsi(sysinfo, 1, 1, 1)) + return; + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + free_page((unsigned long)diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr, u64 *rc) +{ + register u64 code asm("0") = 0; + register u64 addr asm("2") = vaddr; + register u64 rcode asm("3"); + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[code],%[addr]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=d" (cc), "=d" (rcode) + : [code] "d" (code), [addr] "a" (addr) + : "memory", "cc"); + *rc = rcode; + return cc; +} + +static int fill_dst(void *dst, u64 *rc) +{ + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + + /* + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. + */ + if (test_facility(74)) + return sthyi((u64)dst, rc); + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + *rc = 0; + return 0; +} + +static int sthyi_init_cache(void) +{ + if (sthyi_cache.info) + return 0; + sthyi_cache.info = (void *)get_zeroed_page(GFP_KERNEL); + if (!sthyi_cache.info) + return -ENOMEM; + sthyi_cache.end = jiffies - 1; /* expired */ + return 0; +} + +static int sthyi_update_cache(u64 *rc) +{ + int r; + + memset(sthyi_cache.info, 0, PAGE_SIZE); + r = fill_dst(sthyi_cache.info, rc); + if (r) + return r; + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + return r; +} + +/* + * sthyi_fill - Fill page with data returned by the STHYI instruction + * + * @dst: Pointer to zeroed page + * @rc: Pointer for storing the return code of the instruction + * + * Fills the destination with system information returned by the STHYI + * instruction. The data is generated by emulation or execution of STHYI, + * if available. The return value is the condition code that would be + * returned, the rc parameter is the return code which is passed in + * register R2 + 1. + */ +int sthyi_fill(void *dst, u64 *rc) +{ + int r; + + mutex_lock(&sthyi_mutex); + r = sthyi_init_cache(); + if (r) + goto out; + + if (time_is_before_jiffies(sthyi_cache.end)) { + /* cache expired */ + r = sthyi_update_cache(rc); + if (r) + goto out; + } + *rc = 0; + memcpy(dst, sthyi_cache.info, PAGE_SIZE); +out: + mutex_unlock(&sthyi_mutex); + return r; +} +EXPORT_SYMBOL_GPL(sthyi_fill); + +SYSCALL_DEFINE4(s390_sthyi, unsigned long, function_code, void __user *, buffer, + u64 __user *, return_code, unsigned long, flags) +{ + u64 sthyi_rc; + void *info; + int r; + + if (flags) + return -EINVAL; + if (function_code != STHYI_FC_CP_IFL_CAP) + return -EOPNOTSUPP; + info = (void *)get_zeroed_page(GFP_KERNEL); + if (!info) + return -ENOMEM; + r = sthyi_fill(info, &sthyi_rc); + if (r < 0) + goto out; + if (return_code && put_user(sthyi_rc, return_code)) { + r = -EFAULT; + goto out; + } + if (copy_to_user(buffer, info, PAGE_SIZE)) + r = -EFAULT; +out: + free_page((unsigned long)info); + return r; +} diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c new file mode 100644 index 000000000..75b7b3079 --- /dev/null +++ b/arch/s390/kernel/suspend.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Suspend support specific for s390. + * + * Copyright IBM Corp. 2009 + * + * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> + */ + +#include <linux/pfn.h> +#include <linux/suspend.h> +#include <linux/mm.h> +#include <linux/pci.h> +#include <asm/ctl_reg.h> +#include <asm/ipl.h> +#include <asm/cio.h> +#include <asm/sections.h> +#include "entry.h" + +/* + * The restore of the saved pages in an hibernation image will set + * the change and referenced bits in the storage key for each page. + * Overindication of the referenced bits after an hibernation cycle + * does not cause any harm but the overindication of the change bits + * would cause trouble. + * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each + * page to the most significant byte of the associated page frame + * number in the hibernation image. + */ + +/* + * Key storage is allocated as a linked list of pages. + * The size of the keys array is (PAGE_SIZE - sizeof(long)) + */ +struct page_key_data { + struct page_key_data *next; + unsigned char data[]; +}; + +#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *)) + +static struct page_key_data *page_key_data; +static struct page_key_data *page_key_rp, *page_key_wp; +static unsigned long page_key_rx, page_key_wx; +unsigned long suspend_zero_pages; + +/* + * For each page in the hibernation image one additional byte is + * stored in the most significant byte of the page frame number. + * On suspend no additional memory is required but on resume the + * keys need to be memorized until the page data has been restored. + * Only then can the storage keys be set to their old state. + */ +unsigned long page_key_additional_pages(unsigned long pages) +{ + return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); +} + +/* + * Free page_key_data list of arrays. + */ +void page_key_free(void) +{ + struct page_key_data *pkd; + + while (page_key_data) { + pkd = page_key_data; + page_key_data = pkd->next; + free_page((unsigned long) pkd); + } +} + +/* + * Allocate page_key_data list of arrays with enough room to store + * one byte for each page in the hibernation image. + */ +int page_key_alloc(unsigned long pages) +{ + struct page_key_data *pk; + unsigned long size; + + size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); + while (size--) { + pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL); + if (!pk) { + page_key_free(); + return -ENOMEM; + } + pk->next = page_key_data; + page_key_data = pk; + } + page_key_rp = page_key_wp = page_key_data; + page_key_rx = page_key_wx = 0; + return 0; +} + +/* + * Save the storage key into the upper 8 bits of the page frame number. + */ +void page_key_read(unsigned long *pfn) +{ + struct page *page; + unsigned long addr; + unsigned char key; + + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; +} + +/* + * Extract the storage key from the upper 8 bits of the page frame number + * and store it in the page_key_data list of arrays. + */ +void page_key_memorize(unsigned long *pfn) +{ + page_key_wp->data[page_key_wx] = *(unsigned char *) pfn; + *(unsigned char *) pfn = 0; + if (++page_key_wx < PAGE_KEY_DATA_SIZE) + return; + page_key_wp = page_key_wp->next; + page_key_wx = 0; +} + +/* + * Get the next key from the page_key_data list of arrays and set the + * storage key of the page referred by @address. If @address refers to + * a "safe" page the swsusp_arch_resume code will transfer the storage + * key from the buffer page to the original page. + */ +void page_key_write(void *address) +{ + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0); + page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); + if (++page_key_rx >= PAGE_KEY_DATA_SIZE) + return; + page_key_rp = page_key_rp->next; + page_key_rx = 0; +} + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); + unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); + unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1; + unsigned long stext_pfn = PFN_DOWN(__pa(_stext)); + + /* Always save lowcore pages (LC protection might be enabled). */ + if (pfn <= LC_PAGES) + return 0; + if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) + return 1; + /* Skip memory holes and read-only pages (DCSS, ...). */ + if (pfn >= stext_pfn && pfn <= end_rodata_pfn) + return 0; + if (tprot(PFN_PHYS(pfn))) + return 1; + return 0; +} + +/* + * PM notifier callback for suspend + */ +static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); + if (!suspend_zero_pages) + return NOTIFY_BAD; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + free_pages(suspend_zero_pages, LC_ORDER); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static int __init suspend_pm_init(void) +{ + pm_notifier(suspend_pm_cb, 0); + return 0; +} +arch_initcall(suspend_pm_init); + +void save_processor_state(void) +{ + /* swsusp_arch_suspend() actually saves all cpu register contents. + * Machine checks must be disabled since swsusp_arch_suspend() stores + * register contents to their lowcore save areas. That's the same + * place where register contents on machine checks would be saved. + * To avoid register corruption disable machine checks. + * We must also disable machine checks in the new psw mask for + * program checks, since swsusp_arch_suspend() may generate program + * checks. Disabling machine checks for all other new psw masks is + * just paranoia. + */ + local_mcck_disable(); + /* Disable lowcore protection */ + __ctl_clear_bit(0,28); + S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK; + S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK; + S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK; + S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK; +} + +void restore_processor_state(void) +{ + S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK; + S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK; + S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK; + S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK; + /* Enable lowcore protection */ + __ctl_set_bit(0,28); + local_mcck_enable(); +} + +/* Called at the end of swsusp_arch_resume */ +void s390_early_resume(void) +{ + lgr_info_log(); + channel_subsystem_reinit(); + zpci_rescan(); +} diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S new file mode 100644 index 000000000..c1a080b11 --- /dev/null +++ b/arch/s390/kernel/swsusp.S @@ -0,0 +1,286 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 64-bit swsusp implementation + * + * Copyright IBM Corp. 2009 + * + * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> + * Michael Holzheu <holzheu@linux.vnet.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> +#include <asm/sigp.h> + +/* + * Save register context in absolute 0 lowcore and call swsusp_save() to + * create in-memory kernel image. The context is saved in the designated + * "store status" memory locations (see POP). + * We return from this function twice. The first time during the suspend to + * disk process. The second time via the swsusp_arch_resume() function + * (see below) in the resume process. + * This function runs with disabled interrupts. + */ + GEN_BR_THUNK %r14 + + .section .text +ENTRY(swsusp_arch_suspend) + stmg %r6,%r15,__SF_GPRS(%r15) + lgr %r1,%r15 + aghi %r15,-STACK_FRAME_OVERHEAD + stg %r1,__SF_BACKCHAIN(%r15) + + /* Store FPU registers */ + brasl %r14,save_fpu_regs + + /* Deactivate DAT */ + stnsm __SF_EMPTY(%r15),0xfb + + /* Store prefix register on stack */ + stpx __SF_EMPTY(%r15) + + /* Save prefix register contents for lowcore copy */ + llgf %r10,__SF_EMPTY(%r15) + + /* Get pointer to save area */ + lghi %r1,0x1000 + + /* Save CPU address */ + stap __LC_EXT_CPU_ADDR(%r0) + + /* Store registers */ + mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */ + stam %a0,%a15,0x340(%r1) /* store access registers */ + stctg %c0,%c15,0x380(%r1) /* store control registers */ + stmg %r0,%r15,0x280(%r1) /* store general registers */ + + stpt 0x328(%r1) /* store timer */ + stck __SF_EMPTY(%r15) /* store clock */ + stckc 0x330(%r1) /* store clock comparator */ + + /* Update cputime accounting before going to sleep */ + lg %r0,__LC_LAST_UPDATE_TIMER + slg %r0,0x328(%r1) + alg %r0,__LC_SYSTEM_TIMER + stg %r0,__LC_SYSTEM_TIMER + mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1) + lg %r0,__LC_LAST_UPDATE_CLOCK + slg %r0,__SF_EMPTY(%r15) + alg %r0,__LC_STEAL_TIMER + stg %r0,__LC_STEAL_TIMER + mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15) + + /* Activate DAT */ + stosm __SF_EMPTY(%r15),0x04 + + /* Set prefix page to zero */ + xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) + spx __SF_EMPTY(%r15) + + /* Save absolute zero pages */ + larl %r2,suspend_zero_pages + lg %r2,0(%r2) + lghi %r4,0 + lghi %r3,2*PAGE_SIZE + lghi %r5,2*PAGE_SIZE +1: mvcle %r2,%r4,0 + jo 1b + + /* Copy lowcore to absolute zero lowcore */ + lghi %r2,0 + lgr %r4,%r10 + lghi %r3,2*PAGE_SIZE + lghi %r5,2*PAGE_SIZE +1: mvcle %r2,%r4,0 + jo 1b + + /* Save image */ + brasl %r14,swsusp_save + + /* Restore prefix register and return */ + lghi %r1,0x1000 + spx 0x318(%r1) + lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) + lghi %r2,0 + BR_EX %r14 + +/* + * Restore saved memory image to correct place and restore register context. + * Then we return to the function that called swsusp_arch_suspend(). + * swsusp_arch_resume() runs with disabled interrupts. + */ +ENTRY(swsusp_arch_resume) + stmg %r6,%r15,__SF_GPRS(%r15) + lgr %r1,%r15 + aghi %r15,-STACK_FRAME_OVERHEAD + stg %r1,__SF_BACKCHAIN(%r15) + + /* Make all free pages stable */ + lghi %r2,1 + brasl %r14,arch_set_page_states + + /* Deactivate DAT */ + stnsm __SF_EMPTY(%r15),0xfb + + /* Set prefix page to zero */ + xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) + spx __SF_EMPTY(%r15) + + /* Restore saved image */ + larl %r1,restore_pblist + lg %r1,0(%r1) + ltgr %r1,%r1 + jz 2f +0: + lg %r2,8(%r1) + lg %r4,0(%r1) + iske %r0,%r4 + lghi %r3,PAGE_SIZE + lghi %r5,PAGE_SIZE +1: + mvcle %r2,%r4,0 + jo 1b + lg %r2,8(%r1) + sske %r0,%r2 + lg %r1,16(%r1) + ltgr %r1,%r1 + jnz 0b +2: + ptlb /* flush tlb */ + + /* Reset System */ + larl %r1,restart_entry + larl %r2,.Lrestart_diag308_psw + og %r1,0(%r2) + stg %r1,0(%r0) + larl %r1,.Lnew_pgm_check_psw + epsw %r2,%r3 + stm %r2,%r3,0(%r1) + mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) + lghi %r0,0 + diag %r0,%r0,0x308 +restart_entry: + lhi %r1,1 + sigp %r1,%r0,SIGP_SET_ARCHITECTURE + sam64 +#ifdef CONFIG_SMP + larl %r1,smp_cpu_mt_shift + icm %r1,15,0(%r1) + jz smt_done + llgfr %r1,%r1 +smt_loop: + sigp %r1,%r0,SIGP_SET_MULTI_THREADING + brc 8,smt_done /* accepted */ + brc 2,smt_loop /* busy, try again */ +smt_done: +#endif + larl %r1,.Lnew_pgm_check_psw + lpswe 0(%r1) +pgm_check_entry: + + /* Switch to original suspend CPU */ + larl %r1,.Lresume_cpu /* Resume CPU address: r2 */ + stap 0(%r1) + llgh %r2,0(%r1) + llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */ + cgr %r1,%r2 + je restore_registers /* r1 = r2 -> nothing to do */ + larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */ + mvc __LC_RST_NEW_PSW(16,%r0),0(%r4) +3: + sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */ + brc 8,4f /* accepted */ + brc 2,3b /* busy, try again */ + + /* Suspend CPU not available -> panic */ + larl %r15,init_thread_union + aghi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) + aghi %r15,-STACK_FRAME_OVERHEAD + larl %r2,.Lpanic_string + brasl %r14,sclp_early_printk_force + larl %r3,.Ldisabled_wait_31 + lpsw 0(%r3) +4: + /* Switch to suspend CPU */ + sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */ + brc 2,4b /* busy, try again */ +5: + sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */ + brc 2,5b /* busy, try again */ +6: j 6b + +restart_suspend: + larl %r1,.Lresume_cpu + llgh %r2,0(%r1) +7: + sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */ + brc 8,7b /* accepted, status 0, still running */ + brc 2,7b /* busy, try again */ + tmll %r9,0x40 /* Test if resume CPU is stopped */ + jz 7b + +restore_registers: + /* Restore registers */ + lghi %r13,0x1000 /* %r1 = pointer to save area */ + + /* Ignore time spent in suspended state. */ + llgf %r1,0x318(%r13) + stck __LC_LAST_UPDATE_CLOCK(%r1) + spt 0x328(%r13) /* reprogram timer */ + //sckc 0x330(%r13) /* set clock comparator */ + + lctlg %c0,%c15,0x380(%r13) /* load control registers */ + lam %a0,%a15,0x340(%r13) /* load access registers */ + + /* Load old stack */ + lg %r15,0x2f8(%r13) + + /* Save prefix register */ + mvc __SF_EMPTY(4,%r15),0x318(%r13) + + /* Restore absolute zero pages */ + lghi %r2,0 + larl %r4,suspend_zero_pages + lg %r4,0(%r4) + lghi %r3,2*PAGE_SIZE + lghi %r5,2*PAGE_SIZE +1: mvcle %r2,%r4,0 + jo 1b + + /* Restore prefix register */ + spx __SF_EMPTY(%r15) + + /* Activate DAT */ + stosm __SF_EMPTY(%r15),0x04 + + /* Make all free pages unstable */ + lghi %r2,0 + brasl %r14,arch_set_page_states + + /* Call arch specific early resume code */ + brasl %r14,s390_early_resume + + /* Return 0 */ + lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) + lghi %r2,0 + BR_EX %r14 + + .section .data..nosave,"aw",@progbits + .align 8 +.Ldisabled_wait_31: + .long 0x000a0000,0x00000000 +.Lpanic_string: + .asciz "Resume not possible because suspend CPU is no longer available\n" + .align 8 +.Lrestart_diag308_psw: + .long 0x00080000,0x80000000 +.Lrestart_suspend_psw: + .quad 0x0000000180000000,restart_suspend +.Lnew_pgm_check_psw: + .quad 0,pgm_check_entry +.Lresume_cpu: + .byte 0,0 diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c new file mode 100644 index 000000000..31cefe0c2 --- /dev/null +++ b/arch/s390/kernel/sys_s390.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Thomas Spatzier (tspat@de.ibm.com) + * + * Derived from "arch/i386/kernel/sys_i386.c" + * + * This file contains various random system calls that + * have a non-standard calling sequence on the Linux/s390 + * platform. + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/smp.h> +#include <linux/sem.h> +#include <linux/msg.h> +#include <linux/shm.h> +#include <linux/stat.h> +#include <linux/syscalls.h> +#include <linux/mman.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <linux/personality.h> +#include <linux/unistd.h> +#include <linux/ipc.h> +#include <linux/uaccess.h> +#include "entry.h" + +/* + * Perform the mmap() system call. Linux for S/390 isn't able to handle more + * than 5 system call parameters, so this system call uses a memory block + * for parameter passing. + */ + +struct s390_mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) +{ + struct s390_mmap_arg_struct a; + int error = -EFAULT; + + if (copy_from_user(&a, arg, sizeof(a))) + goto out; + error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); +out: + return error; +} + +/* + * sys_ipc() is the de-multiplexer for the SysV IPC calls. + */ +SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, + unsigned long, third, void __user *, ptr) +{ + if (call >> 16) + return -EINVAL; + /* The s390 sys_ipc variant has only five parameters instead of six + * like the generic variant. The only difference is the handling of + * the SEMTIMEDOP subcall where on s390 the third parameter is used + * as a pointer to a struct timespec where the generic variant uses + * the fifth parameter. + * Therefore we can call the generic variant by simply passing the + * third parameter also as fifth parameter. + */ + return sys_ipc(call, first, second, third, ptr, third); +} + +SYSCALL_DEFINE1(s390_personality, unsigned int, personality) +{ + unsigned int ret; + + if (personality(current->personality) == PER_LINUX32 && + personality(personality) == PER_LINUX) + personality |= PER_LINUX32; + ret = sys_personality(personality); + if (personality(ret) == PER_LINUX32) + ret &= ~PER_LINUX32; + + return ret; +} diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile new file mode 100644 index 000000000..4d929edc8 --- /dev/null +++ b/arch/s390/kernel/syscalls/Makefile @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 + +gen := arch/$(ARCH)/include/generated +kapi := $(gen)/asm +uapi := $(gen)/uapi/asm + +syscall := $(srctree)/$(src)/syscall.tbl +systbl := $(srctree)/$(src)/syscalltbl + +gen-y := $(kapi)/syscall_table.h +kapi-hdrs-y := $(kapi)/unistd_nr.h +uapi-hdrs-y := $(uapi)/unistd_32.h +uapi-hdrs-y += $(uapi)/unistd_64.h + +targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) + +PHONY += kapi uapi + +kapi: $(gen-y) $(kapi-hdrs-y) +uapi: $(uapi-hdrs-y) + + +# Create output directory if not already present +_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ + $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') + +define filechk_syshdr + $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< +endef + +define filechk_sysnr + $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< +endef + +define filechk_syscalls + $(CONFIG_SHELL) '$(systbl)' -S < $< +endef + +syshdr_abi_unistd_32 := common,32 +$(uapi)/unistd_32.h: $(syscall) FORCE + $(call filechk,syshdr,$@) + +syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: $(syscall) FORCE + $(call filechk,syshdr,$@) + +$(kapi)/syscall_table.h: $(syscall) FORCE + $(call filechk,syscalls) + +sysnr_abi_unistd_nr := common,32,64 +$(kapi)/unistd_nr.h: $(syscall) FORCE + $(call filechk,sysnr) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl new file mode 100644 index 000000000..022fc099b --- /dev/null +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -0,0 +1,393 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# System call table for s390 +# +# Format: +# +# <nr> <abi> <syscall> <entry-64bit> <compat-entry> +# +# where <abi> can be common, 64, or 32 + +1 common exit sys_exit sys_exit +2 common fork sys_fork sys_fork +3 common read sys_read compat_sys_s390_read +4 common write sys_write compat_sys_s390_write +5 common open sys_open compat_sys_open +6 common close sys_close sys_close +7 common restart_syscall sys_restart_syscall sys_restart_syscall +8 common creat sys_creat compat_sys_creat +9 common link sys_link compat_sys_link +10 common unlink sys_unlink compat_sys_unlink +11 common execve sys_execve compat_sys_execve +12 common chdir sys_chdir compat_sys_chdir +13 32 time - compat_sys_time +14 common mknod sys_mknod compat_sys_mknod +15 common chmod sys_chmod compat_sys_chmod +16 32 lchown - compat_sys_s390_lchown16 +19 common lseek sys_lseek compat_sys_lseek +20 common getpid sys_getpid sys_getpid +21 common mount sys_mount compat_sys_mount +22 common umount sys_oldumount compat_sys_oldumount +23 32 setuid - compat_sys_s390_setuid16 +24 32 getuid - compat_sys_s390_getuid16 +25 32 stime - compat_sys_stime +26 common ptrace sys_ptrace compat_sys_ptrace +27 common alarm sys_alarm sys_alarm +29 common pause sys_pause sys_pause +30 common utime sys_utime compat_sys_utime +33 common access sys_access compat_sys_access +34 common nice sys_nice sys_nice +36 common sync sys_sync sys_sync +37 common kill sys_kill sys_kill +38 common rename sys_rename compat_sys_rename +39 common mkdir sys_mkdir compat_sys_mkdir +40 common rmdir sys_rmdir compat_sys_rmdir +41 common dup sys_dup sys_dup +42 common pipe sys_pipe compat_sys_pipe +43 common times sys_times compat_sys_times +45 common brk sys_brk compat_sys_brk +46 32 setgid - compat_sys_s390_setgid16 +47 32 getgid - compat_sys_s390_getgid16 +48 common signal sys_signal compat_sys_signal +49 32 geteuid - compat_sys_s390_geteuid16 +50 32 getegid - compat_sys_s390_getegid16 +51 common acct sys_acct compat_sys_acct +52 common umount2 sys_umount compat_sys_umount +54 common ioctl sys_ioctl compat_sys_ioctl +55 common fcntl sys_fcntl compat_sys_fcntl +57 common setpgid sys_setpgid sys_setpgid +60 common umask sys_umask sys_umask +61 common chroot sys_chroot compat_sys_chroot +62 common ustat sys_ustat compat_sys_ustat +63 common dup2 sys_dup2 sys_dup2 +64 common getppid sys_getppid sys_getppid +65 common getpgrp sys_getpgrp sys_getpgrp +66 common setsid sys_setsid sys_setsid +67 common sigaction sys_sigaction compat_sys_sigaction +70 32 setreuid - compat_sys_s390_setreuid16 +71 32 setregid - compat_sys_s390_setregid16 +72 common sigsuspend sys_sigsuspend compat_sys_sigsuspend +73 common sigpending sys_sigpending compat_sys_sigpending +74 common sethostname sys_sethostname compat_sys_sethostname +75 common setrlimit sys_setrlimit compat_sys_setrlimit +76 32 getrlimit - compat_sys_old_getrlimit +77 common getrusage sys_getrusage compat_sys_getrusage +78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 common settimeofday sys_settimeofday compat_sys_settimeofday +80 32 getgroups - compat_sys_s390_getgroups16 +81 32 setgroups - compat_sys_s390_setgroups16 +83 common symlink sys_symlink compat_sys_symlink +85 common readlink sys_readlink compat_sys_readlink +86 common uselib sys_uselib compat_sys_uselib +87 common swapon sys_swapon compat_sys_swapon +88 common reboot sys_reboot compat_sys_reboot +89 common readdir - compat_sys_old_readdir +90 common mmap sys_old_mmap compat_sys_s390_old_mmap +91 common munmap sys_munmap compat_sys_munmap +92 common truncate sys_truncate compat_sys_truncate +93 common ftruncate sys_ftruncate compat_sys_ftruncate +94 common fchmod sys_fchmod sys_fchmod +95 32 fchown - compat_sys_s390_fchown16 +96 common getpriority sys_getpriority sys_getpriority +97 common setpriority sys_setpriority sys_setpriority +99 common statfs sys_statfs compat_sys_statfs +100 common fstatfs sys_fstatfs compat_sys_fstatfs +101 32 ioperm - - +102 common socketcall sys_socketcall compat_sys_socketcall +103 common syslog sys_syslog compat_sys_syslog +104 common setitimer sys_setitimer compat_sys_setitimer +105 common getitimer sys_getitimer compat_sys_getitimer +106 common stat sys_newstat compat_sys_newstat +107 common lstat sys_newlstat compat_sys_newlstat +108 common fstat sys_newfstat compat_sys_newfstat +110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +111 common vhangup sys_vhangup sys_vhangup +112 common idle - - +114 common wait4 sys_wait4 compat_sys_wait4 +115 common swapoff sys_swapoff compat_sys_swapoff +116 common sysinfo sys_sysinfo compat_sys_sysinfo +117 common ipc sys_s390_ipc compat_sys_s390_ipc +118 common fsync sys_fsync sys_fsync +119 common sigreturn sys_sigreturn compat_sys_sigreturn +120 common clone sys_clone compat_sys_clone +121 common setdomainname sys_setdomainname compat_sys_setdomainname +122 common uname sys_newuname compat_sys_newuname +124 common adjtimex sys_adjtimex compat_sys_adjtimex +125 common mprotect sys_mprotect compat_sys_mprotect +126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 common create_module - - +128 common init_module sys_init_module compat_sys_init_module +129 common delete_module sys_delete_module compat_sys_delete_module +130 common get_kernel_syms - - +131 common quotactl sys_quotactl compat_sys_quotactl +132 common getpgid sys_getpgid sys_getpgid +133 common fchdir sys_fchdir sys_fchdir +134 common bdflush sys_bdflush compat_sys_bdflush +135 common sysfs sys_sysfs compat_sys_sysfs +136 common personality sys_s390_personality sys_s390_personality +137 common afs_syscall - - +138 32 setfsuid - compat_sys_s390_setfsuid16 +139 32 setfsgid - compat_sys_s390_setfsgid16 +140 32 _llseek - compat_sys_llseek +141 common getdents sys_getdents compat_sys_getdents +142 32 _newselect - compat_sys_select +142 64 select sys_select - +143 common flock sys_flock sys_flock +144 common msync sys_msync compat_sys_msync +145 common readv sys_readv compat_sys_readv +146 common writev sys_writev compat_sys_writev +147 common getsid sys_getsid sys_getsid +148 common fdatasync sys_fdatasync sys_fdatasync +149 common _sysctl sys_sysctl compat_sys_sysctl +150 common mlock sys_mlock compat_sys_mlock +151 common munlock sys_munlock compat_sys_munlock +152 common mlockall sys_mlockall sys_mlockall +153 common munlockall sys_munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam compat_sys_sched_setparam +155 common sched_getparam sys_sched_getparam compat_sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler compat_sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep compat_sys_nanosleep +163 common mremap sys_mremap compat_sys_mremap +164 32 setresuid - compat_sys_s390_setresuid16 +165 32 getresuid - compat_sys_s390_getresuid16 +167 common query_module - - +168 common poll sys_poll compat_sys_poll +169 common nfsservctl - - +170 32 setresgid - compat_sys_s390_setresgid16 +171 32 getresgid - compat_sys_s390_getresgid16 +172 common prctl sys_prctl compat_sys_prctl +173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 common pread64 sys_pread64 compat_sys_s390_pread64 +181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 +182 32 chown - compat_sys_s390_chown16 +183 common getcwd sys_getcwd compat_sys_getcwd +184 common capget sys_capget compat_sys_capget +185 common capset sys_capset compat_sys_capset +186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 common sendfile sys_sendfile64 compat_sys_sendfile +188 common getpmsg - - +189 common putpmsg - - +190 common vfork sys_vfork sys_vfork +191 32 ugetrlimit - compat_sys_getrlimit +191 64 getrlimit sys_getrlimit - +192 32 mmap2 - compat_sys_s390_mmap2 +193 32 truncate64 - compat_sys_s390_truncate64 +194 32 ftruncate64 - compat_sys_s390_ftruncate64 +195 32 stat64 - compat_sys_s390_stat64 +196 32 lstat64 - compat_sys_s390_lstat64 +197 32 fstat64 - compat_sys_s390_fstat64 +198 32 lchown32 - compat_sys_lchown +198 64 lchown sys_lchown - +199 32 getuid32 - sys_getuid +199 64 getuid sys_getuid - +200 32 getgid32 - sys_getgid +200 64 getgid sys_getgid - +201 32 geteuid32 - sys_geteuid +201 64 geteuid sys_geteuid - +202 32 getegid32 - sys_getegid +202 64 getegid sys_getegid - +203 32 setreuid32 - sys_setreuid +203 64 setreuid sys_setreuid - +204 32 setregid32 - sys_setregid +204 64 setregid sys_setregid - +205 32 getgroups32 - compat_sys_getgroups +205 64 getgroups sys_getgroups - +206 32 setgroups32 - compat_sys_setgroups +206 64 setgroups sys_setgroups - +207 32 fchown32 - sys_fchown +207 64 fchown sys_fchown - +208 32 setresuid32 - sys_setresuid +208 64 setresuid sys_setresuid - +209 32 getresuid32 - compat_sys_getresuid +209 64 getresuid sys_getresuid - +210 32 setresgid32 - sys_setresgid +210 64 setresgid sys_setresgid - +211 32 getresgid32 - compat_sys_getresgid +211 64 getresgid sys_getresgid - +212 32 chown32 - compat_sys_chown +212 64 chown sys_chown - +213 32 setuid32 - sys_setuid +213 64 setuid sys_setuid - +214 32 setgid32 - sys_setgid +214 64 setgid sys_setgid - +215 32 setfsuid32 - sys_setfsuid +215 64 setfsuid sys_setfsuid - +216 32 setfsgid32 - sys_setfsgid +216 64 setfsgid sys_setfsgid - +217 common pivot_root sys_pivot_root compat_sys_pivot_root +218 common mincore sys_mincore compat_sys_mincore +219 common madvise sys_madvise compat_sys_madvise +220 common getdents64 sys_getdents64 compat_sys_getdents64 +221 32 fcntl64 - compat_sys_fcntl64 +222 common readahead sys_readahead compat_sys_s390_readahead +223 32 sendfile64 - compat_sys_sendfile64 +224 common setxattr sys_setxattr compat_sys_setxattr +225 common lsetxattr sys_lsetxattr compat_sys_lsetxattr +226 common fsetxattr sys_fsetxattr compat_sys_fsetxattr +227 common getxattr sys_getxattr compat_sys_getxattr +228 common lgetxattr sys_lgetxattr compat_sys_lgetxattr +229 common fgetxattr sys_fgetxattr compat_sys_fgetxattr +230 common listxattr sys_listxattr compat_sys_listxattr +231 common llistxattr sys_llistxattr compat_sys_llistxattr +232 common flistxattr sys_flistxattr compat_sys_flistxattr +233 common removexattr sys_removexattr compat_sys_removexattr +234 common lremovexattr sys_lremovexattr compat_sys_lremovexattr +235 common fremovexattr sys_fremovexattr compat_sys_fremovexattr +236 common gettid sys_gettid sys_gettid +237 common tkill sys_tkill sys_tkill +238 common futex sys_futex compat_sys_futex +239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +241 common tgkill sys_tgkill sys_tgkill +243 common io_setup sys_io_setup compat_sys_io_setup +244 common io_destroy sys_io_destroy compat_sys_io_destroy +245 common io_getevents sys_io_getevents compat_sys_io_getevents +246 common io_submit sys_io_submit compat_sys_io_submit +247 common io_cancel sys_io_cancel compat_sys_io_cancel +248 common exit_group sys_exit_group sys_exit_group +249 common epoll_create sys_epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl compat_sys_epoll_ctl +251 common epoll_wait sys_epoll_wait compat_sys_epoll_wait +252 common set_tid_address sys_set_tid_address compat_sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 +254 common timer_create sys_timer_create compat_sys_timer_create +255 common timer_settime sys_timer_settime compat_sys_timer_settime +256 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime compat_sys_clock_settime +260 common clock_gettime sys_clock_gettime compat_sys_clock_gettime +261 common clock_getres sys_clock_getres compat_sys_clock_getres +262 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 +265 common statfs64 sys_statfs64 compat_sys_statfs64 +266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages compat_sys_remap_file_pages +268 common mbind sys_mbind compat_sys_mbind +269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +271 common mq_open sys_mq_open compat_sys_mq_open +272 common mq_unlink sys_mq_unlink compat_sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +274 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +275 common mq_notify sys_mq_notify compat_sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +277 common kexec_load sys_kexec_load compat_sys_kexec_load +278 common add_key sys_add_key compat_sys_add_key +279 common request_key sys_request_key compat_sys_request_key +280 common keyctl sys_keyctl compat_sys_keyctl +281 common waitid sys_waitid compat_sys_waitid +282 common ioprio_set sys_ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch compat_sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +288 common openat sys_openat compat_sys_openat +289 common mkdirat sys_mkdirat compat_sys_mkdirat +290 common mknodat sys_mknodat compat_sys_mknodat +291 common fchownat sys_fchownat compat_sys_fchownat +292 common futimesat sys_futimesat compat_sys_futimesat +293 32 fstatat64 - compat_sys_s390_fstatat64 +293 64 newfstatat sys_newfstatat - +294 common unlinkat sys_unlinkat compat_sys_unlinkat +295 common renameat sys_renameat compat_sys_renameat +296 common linkat sys_linkat compat_sys_linkat +297 common symlinkat sys_symlinkat compat_sys_symlinkat +298 common readlinkat sys_readlinkat compat_sys_readlinkat +299 common fchmodat sys_fchmodat compat_sys_fchmodat +300 common faccessat sys_faccessat compat_sys_faccessat +301 common pselect6 sys_pselect6 compat_sys_pselect6 +302 common ppoll sys_ppoll compat_sys_ppoll +303 common unshare sys_unshare compat_sys_unshare +304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list +305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list +306 common splice sys_splice compat_sys_splice +307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range +308 common tee sys_tee compat_sys_tee +309 common vmsplice sys_vmsplice compat_sys_vmsplice +310 common move_pages sys_move_pages compat_sys_move_pages +311 common getcpu sys_getcpu compat_sys_getcpu +312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait +313 common utimes sys_utimes compat_sys_utimes +314 common fallocate sys_fallocate compat_sys_s390_fallocate +315 common utimensat sys_utimensat compat_sys_utimensat +316 common signalfd sys_signalfd compat_sys_signalfd +317 common timerfd - - +318 common eventfd sys_eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +321 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 +323 common eventfd2 sys_eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 compat_sys_pipe2 +326 common dup3 sys_dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv compat_sys_preadv +329 common pwritev sys_pwritev compat_sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open compat_sys_perf_event_open +332 common fanotify_init sys_fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +334 common prlimit64 sys_prlimit64 compat_sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at compat_sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +338 common syncfs sys_syncfs sys_syncfs +339 common setns sys_setns sys_setns +340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp compat_sys_kcmp +344 common finit_module sys_finit_module compat_sys_finit_module +345 common sched_setattr sys_sched_setattr compat_sys_sched_setattr +346 common sched_getattr sys_sched_getattr compat_sys_sched_getattr +347 common renameat2 sys_renameat2 compat_sys_renameat2 +348 common seccomp sys_seccomp compat_sys_seccomp +349 common getrandom sys_getrandom compat_sys_getrandom +350 common memfd_create sys_memfd_create compat_sys_memfd_create +351 common bpf sys_bpf compat_sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write compat_sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read compat_sys_s390_pci_mmio_read +354 common execveat sys_execveat compat_sys_execveat +355 common userfaultfd sys_userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg +359 common socket sys_socket sys_socket +360 common socketpair sys_socketpair compat_sys_socketpair +361 common bind sys_bind compat_sys_bind +362 common connect sys_connect compat_sys_connect +363 common listen sys_listen sys_listen +364 common accept4 sys_accept4 compat_sys_accept4 +365 common getsockopt sys_getsockopt compat_sys_getsockopt +366 common setsockopt sys_setsockopt compat_sys_setsockopt +367 common getsockname sys_getsockname compat_sys_getsockname +368 common getpeername sys_getpeername compat_sys_getpeername +369 common sendto sys_sendto compat_sys_sendto +370 common sendmsg sys_sendmsg compat_sys_sendmsg +371 common recvfrom sys_recvfrom compat_sys_recvfrom +372 common recvmsg sys_recvmsg compat_sys_recvmsg +373 common shutdown sys_shutdown sys_shutdown +374 common mlock2 sys_mlock2 compat_sys_mlock2 +375 common copy_file_range sys_copy_file_range compat_sys_copy_file_range +376 common preadv2 sys_preadv2 compat_sys_preadv2 +377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage compat_sys_s390_guarded_storage +379 common statx sys_statx compat_sys_statx +380 common s390_sthyi sys_s390_sthyi compat_sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load compat_sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +383 common rseq sys_rseq compat_sys_rseq diff --git a/arch/s390/kernel/syscalls/syscalltbl b/arch/s390/kernel/syscalls/syscalltbl new file mode 100755 index 000000000..fbac1732f --- /dev/null +++ b/arch/s390/kernel/syscalls/syscalltbl @@ -0,0 +1,232 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate system call table and header files +# +# Copyright IBM Corp. 2018 +# Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + +# +# File path to the system call table definition. +# You can set the path with the -i option. If omitted, +# system call table definitions are read from standard input. +# +SYSCALL_TBL="" + + +create_syscall_table_entries() +{ + local nr abi name entry64 entry32 _ignore + local temp=$(mktemp ${TMPDIR:-/tmp}/syscalltbl-common.XXXXXXXXX) + + ( + # + # Initialize with 0 to create an NI_SYSCALL for 0 + # + local prev_nr=0 prev_32=sys_ni_syscall prev_64=sys_ni_syscall + while read nr abi name entry64 entry32 _ignore; do + test x$entry32 = x- && entry32=sys_ni_syscall + test x$entry64 = x- && entry64=sys_ni_syscall + + if test $prev_nr -eq $nr; then + # + # Same syscall but different ABI, just update + # the respective entry point + # + case $abi in + 32) + prev_32=$entry32 + ;; + 64) + prev_64=$entry64 + ;; + esac + continue; + else + printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 + fi + + prev_nr=$nr + prev_64=$entry64 + prev_32=$entry32 + done + printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 + ) >> $temp + + # + # Check for duplicate syscall numbers + # + if ! cat $temp |cut -f1 |uniq -d 2>&1; then + echo "Error: generated system call table contains duplicate entries: $temp" >&2 + exit 1 + fi + + # + # Generate syscall table + # + prev_nr=0 + while read nr entry64 entry32; do + while test $prev_nr -lt $((nr - 1)); do + printf "NI_SYSCALL\n" + prev_nr=$((prev_nr + 1)) + done + if test x$entry64 = xsys_ni_syscall && + test x$entry32 = xsys_ni_syscall; then + printf "NI_SYSCALL\n" + else + printf "SYSCALL(%s,%s)\n" $entry64 $entry32 + fi + prev_nr=$nr + done < $temp + rm $temp +} + +generate_syscall_table() +{ + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 */ + /* + * Definitions for sys_call_table, each line represents an + * entry in the table in the form + * SYSCALL(64 bit syscall, 31 bit emulated syscall) + * + * This file is meant to be included from entry.S. + */ + + #define NI_SYSCALL SYSCALL(sys_ni_syscall,sys_ni_syscall) + +EoHEADER + grep -Ev '^(#|[[:blank:]]*$)' $SYSCALL_TBL \ + |sort -k1 -n \ + |create_syscall_table_entries +} + +create_header_defines() +{ + local nr abi name _ignore + + while read nr abi name _ignore; do + printf "#define __NR_%s %d\n" $name $nr + done +} + +normalize_fileguard() +{ + local fileguard="$1" + + echo "$1" |tr '[[:lower:]]' '[[:upper:]]' \ + |sed -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g' +} + +generate_syscall_header() +{ + local abis=$(echo "($1)" | tr ',' '|') + local filename="$2" + local fileguard suffix + + if test "$filename"; then + fileguard=$(normalize_fileguard "__UAPI_ASM_S390_$2") + else + case "$abis" in + *64*) suffix=64 ;; + *32*) suffix=32 ;; + esac + fileguard=$(normalize_fileguard "__UAPI_ASM_S390_SYSCALLS_$suffix") + fi + + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + #ifndef ${fileguard} + #define ${fileguard} + +EoHEADER + + grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ + |sort -k1 -n \ + |create_header_defines + + cat <<-EoFOOTER + + #endif /* ${fileguard} */ +EoFOOTER +} + +__max_syscall_nr() +{ + local abis=$(echo "($1)" | tr ',' '|') + + grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ + |sed -ne 's/^\([[:digit:]]*\)[[:space:]].*/\1/p' \ + |sort -n \ + |tail -1 +} + + +generate_syscall_nr() +{ + local abis="$1" + local max_syscall_nr num_syscalls + + max_syscall_nr=$(__max_syscall_nr "$abis") + num_syscalls=$((max_syscall_nr + 1)) + + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + #ifndef __ASM_S390_SYSCALLS_NR + #define __ASM_S390_SYSCALLS_NR + + #define NR_syscalls ${num_syscalls} + + #endif /* __ASM_S390_SYSCALLS_NR */ +EoHEADER +} + + +# +# Parse command line arguments +# +do_syscall_header="" +do_syscall_table="" +do_syscall_nr="" +output_file="" +abi_list="common,64" +filename="" +while getopts ":HNSXi:a:f:" arg; do + case $arg in + a) + abi_list="$OPTARG" + ;; + i) + SYSCALL_TBL="$OPTARG" + ;; + f) + filename=${OPTARG##*/} + ;; + H) + do_syscall_header=1 + ;; + N) + do_syscall_nr=1 + ;; + S) + do_syscall_table=1 + ;; + X) + set -x + ;; + :) + echo "Missing argument for -$OPTARG" >&2 + exit 1 + ;; + \?) + echo "Invalid option specified" >&2 + exit 1 + ;; + esac +done + +test "$do_syscall_header" && generate_syscall_header "$abi_list" "$filename" +test "$do_syscall_table" && generate_syscall_table +test "$do_syscall_nr" && generate_syscall_nr "$abi_list" + +exit 0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c new file mode 100644 index 000000000..12f80d1f0 --- /dev/null +++ b/arch/s390/kernel/sysinfo.c @@ -0,0 +1,568 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2001, 2009 + * Author(s): Ulrich Weigand <Ulrich.Weigand@de.ibm.com>, + * Martin Schwidefsky <schwidefsky@de.ibm.com>, + */ + +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <asm/ebcdic.h> +#include <asm/debug.h> +#include <asm/sysinfo.h> +#include <asm/cpcmd.h> +#include <asm/topology.h> +#include <asm/fpu/api.h> + +int topology_max_mnest; + +static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) +{ + register int r0 asm("0") = (fc << 28) | sel1; + register int r1 asm("1") = sel2; + int rc = 0; + + asm volatile( + " stsi 0(%3)\n" + "0: jz 2f\n" + "1: lhi %1,%4\n" + "2:\n" + EX_TABLE(0b, 1b) + : "+d" (r0), "+d" (rc) + : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP) + : "cc", "memory"); + *lvl = ((unsigned int) r0) >> 28; + return rc; +} + +/* + * stsi - store system information + * + * Returns the current configuration level if function code 0 was specified. + * Otherwise returns 0 on success or a negative value on error. + */ +int stsi(void *sysinfo, int fc, int sel1, int sel2) +{ + int lvl, rc; + + rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); + if (rc) + return rc; + return fc ? 0 : lvl; +} +EXPORT_SYMBOL(stsi); + +#ifdef CONFIG_PROC_FS + +static bool convert_ext_name(unsigned char encoding, char *name, size_t len) +{ + switch (encoding) { + case 1: /* EBCDIC */ + EBCASC(name, len); + break; + case 2: /* UTF-8 */ + break; + default: + return false; + } + return true; +} + +static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) +{ + int i; + + if (stsi(info, 1, 1, 1)) + return; + EBCASC(info->manufacturer, sizeof(info->manufacturer)); + EBCASC(info->type, sizeof(info->type)); + EBCASC(info->model, sizeof(info->model)); + EBCASC(info->sequence, sizeof(info->sequence)); + EBCASC(info->plant, sizeof(info->plant)); + EBCASC(info->model_capacity, sizeof(info->model_capacity)); + EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); + EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); + seq_printf(m, "Type: %-4.4s\n", info->type); + if (info->lic) + seq_printf(m, "LIC Identifier: %016lx\n", info->lic); + /* + * Sigh: the model field has been renamed with System z9 + * to model_capacity and a new model field has been added + * after the plant field. To avoid confusing older programs + * the "Model:" prints "model_capacity model" or just + * "model_capacity" if the model string is empty . + */ + seq_printf(m, "Model: %-16.16s", info->model_capacity); + if (info->model[0] != '\0') + seq_printf(m, " %-16.16s", info->model); + seq_putc(m, '\n'); + seq_printf(m, "Sequence Code: %-16.16s\n", info->sequence); + seq_printf(m, "Plant: %-4.4s\n", info->plant); + seq_printf(m, "Model Capacity: %-16.16s %08u\n", + info->model_capacity, info->model_cap_rating); + if (info->model_perm_cap_rating) + seq_printf(m, "Model Perm. Capacity: %-16.16s %08u\n", + info->model_perm_cap, + info->model_perm_cap_rating); + if (info->model_temp_cap_rating) + seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", + info->model_temp_cap, + info->model_temp_cap_rating); + if (info->ncr) + seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); + if (info->npr) + seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); + if (info->ntr) + seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (info->cai) { + seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); + seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); + seq_printf(m, "Capacity Transient: %d\n", info->t); + } + if (info->p) { + for (i = 1; i <= ARRAY_SIZE(info->typepct); i++) { + seq_printf(m, "Type %d Percentage: %d\n", + i, info->typepct[i - 1]); + } + } +} + +static void stsi_15_1_x(struct seq_file *m, struct sysinfo_15_1_x *info) +{ + int i; + + seq_putc(m, '\n'); + if (!MACHINE_HAS_TOPOLOGY) + return; + if (stsi(info, 15, 1, topology_max_mnest)) + return; + seq_printf(m, "CPU Topology HW: "); + for (i = 0; i < TOPOLOGY_NR_MAG; i++) + seq_printf(m, " %d", info->mag[i]); + seq_putc(m, '\n'); +#ifdef CONFIG_SCHED_TOPOLOGY + store_topology(info); + seq_printf(m, "CPU Topology SW: "); + for (i = 0; i < TOPOLOGY_NR_MAG; i++) + seq_printf(m, " %d", info->mag[i]); + seq_putc(m, '\n'); +#endif +} + +static void stsi_1_2_2(struct seq_file *m, struct sysinfo_1_2_2 *info) +{ + struct sysinfo_1_2_2_extension *ext; + int i; + + if (stsi(info, 1, 2, 2)) + return; + ext = (struct sysinfo_1_2_2_extension *) + ((unsigned long) info + info->acc_offset); + seq_printf(m, "CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "CPUs Reserved: %d\n", info->cpus_reserved); + if (info->mt_installed) { + seq_printf(m, "CPUs G-MTID: %d\n", info->mt_gtid); + seq_printf(m, "CPUs S-MTID: %d\n", info->mt_stid); + } + /* + * Sigh 2. According to the specification the alternate + * capability field is a 32 bit floating point number + * if the higher order 8 bits are not zero. Printing + * a floating point number in the kernel is a no-no, + * always print the number as 32 bit unsigned integer. + * The user-space needs to know about the strange + * encoding of the alternate cpu capability. + */ + seq_printf(m, "Capability: %u", info->capability); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_capability); + seq_putc(m, '\n'); + if (info->nominal_cap) + seq_printf(m, "Nominal Capability: %d\n", info->nominal_cap); + if (info->secondary_cap) + seq_printf(m, "Secondary Capability: %d\n", info->secondary_cap); + for (i = 2; i <= info->cpus_total; i++) { + seq_printf(m, "Adjustment %02d-way: %u", + i, info->adjustment[i-2]); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_adjustment[i-2]); + seq_putc(m, '\n'); + } +} + +static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) +{ + if (stsi(info, 2, 2, 2)) + return; + EBCASC(info->name, sizeof(info->name)); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Number: %d\n", info->lpar_number); + seq_printf(m, "LPAR Characteristics: "); + if (info->characteristics & LPAR_CHAR_DEDICATED) + seq_printf(m, "Dedicated "); + if (info->characteristics & LPAR_CHAR_SHARED) + seq_printf(m, "Shared "); + if (info->characteristics & LPAR_CHAR_LIMITED) + seq_printf(m, "Limited "); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Name: %-8.8s\n", info->name); + seq_printf(m, "LPAR Adjustment: %d\n", info->caf); + seq_printf(m, "LPAR CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "LPAR CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "LPAR CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "LPAR CPUs Reserved: %d\n", info->cpus_reserved); + seq_printf(m, "LPAR CPUs Dedicated: %d\n", info->cpus_dedicated); + seq_printf(m, "LPAR CPUs Shared: %d\n", info->cpus_shared); + if (info->mt_installed) { + seq_printf(m, "LPAR CPUs G-MTID: %d\n", info->mt_gtid); + seq_printf(m, "LPAR CPUs S-MTID: %d\n", info->mt_stid); + seq_printf(m, "LPAR CPUs PS-MTID: %d\n", info->mt_psmtid); + } + if (convert_ext_name(info->vsne, info->ext_name, sizeof(info->ext_name))) { + seq_printf(m, "LPAR Extended Name: %-.256s\n", info->ext_name); + seq_printf(m, "LPAR UUID: %pUb\n", &info->uuid); + } +} + +static void print_ext_name(struct seq_file *m, int lvl, + struct sysinfo_3_2_2 *info) +{ + size_t len = sizeof(info->ext_names[lvl]); + + if (!convert_ext_name(info->vm[lvl].evmne, info->ext_names[lvl], len)) + return; + seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl, + info->ext_names[lvl]); +} + +static void print_uuid(struct seq_file *m, int i, struct sysinfo_3_2_2 *info) +{ + if (uuid_is_null(&info->vm[i].uuid)) + return; + seq_printf(m, "VM%02d UUID: %pUb\n", i, &info->vm[i].uuid); +} + +static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) +{ + int i; + + if (stsi(info, 3, 2, 2)) + return; + for (i = 0; i < info->count; i++) { + EBCASC(info->vm[i].name, sizeof(info->vm[i].name)); + EBCASC(info->vm[i].cpi, sizeof(info->vm[i].cpi)); + seq_putc(m, '\n'); + seq_printf(m, "VM%02d Name: %-8.8s\n", i, info->vm[i].name); + seq_printf(m, "VM%02d Control Program: %-16.16s\n", i, info->vm[i].cpi); + seq_printf(m, "VM%02d Adjustment: %d\n", i, info->vm[i].caf); + seq_printf(m, "VM%02d CPUs Total: %d\n", i, info->vm[i].cpus_total); + seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured); + seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby); + seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved); + print_ext_name(m, i, info); + print_uuid(m, i, info); + } +} + +static int sysinfo_show(struct seq_file *m, void *v) +{ + void *info = (void *)get_zeroed_page(GFP_KERNEL); + int level; + + if (!info) + return 0; + level = stsi(NULL, 0, 0, 0); + if (level >= 1) + stsi_1_1_1(m, info); + if (level >= 1) + stsi_15_1_x(m, info); + if (level >= 1) + stsi_1_2_2(m, info); + if (level >= 2) + stsi_2_2_2(m, info); + if (level >= 3) + stsi_3_2_2(m, info); + free_page((unsigned long)info); + return 0; +} + +static int __init sysinfo_create_proc(void) +{ + proc_create_single("sysinfo", 0444, NULL, sysinfo_show); + return 0; +} +device_initcall(sysinfo_create_proc); + +#endif /* CONFIG_PROC_FS */ + +/* + * Service levels interface. + */ + +static DECLARE_RWSEM(service_level_sem); +static LIST_HEAD(service_level_list); + +int register_service_level(struct service_level *slr) +{ + struct service_level *ptr; + + down_write(&service_level_sem); + list_for_each_entry(ptr, &service_level_list, list) + if (ptr == slr) { + up_write(&service_level_sem); + return -EEXIST; + } + list_add_tail(&slr->list, &service_level_list); + up_write(&service_level_sem); + return 0; +} +EXPORT_SYMBOL(register_service_level); + +int unregister_service_level(struct service_level *slr) +{ + struct service_level *ptr, *next; + int rc = -ENOENT; + + down_write(&service_level_sem); + list_for_each_entry_safe(ptr, next, &service_level_list, list) { + if (ptr != slr) + continue; + list_del(&ptr->list); + rc = 0; + break; + } + up_write(&service_level_sem); + return rc; +} +EXPORT_SYMBOL(unregister_service_level); + +static void *service_level_start(struct seq_file *m, loff_t *pos) +{ + down_read(&service_level_sem); + return seq_list_start(&service_level_list, *pos); +} + +static void *service_level_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &service_level_list, pos); +} + +static void service_level_stop(struct seq_file *m, void *p) +{ + up_read(&service_level_sem); +} + +static int service_level_show(struct seq_file *m, void *p) +{ + struct service_level *slr; + + slr = list_entry(p, struct service_level, list); + slr->seq_print(m, slr); + return 0; +} + +static const struct seq_operations service_level_seq_ops = { + .start = service_level_start, + .next = service_level_next, + .stop = service_level_stop, + .show = service_level_show +}; + +static void service_level_vm_print(struct seq_file *m, + struct service_level *slr) +{ + char *query_buffer, *str; + + query_buffer = kmalloc(1024, GFP_KERNEL | GFP_DMA); + if (!query_buffer) + return; + cpcmd("QUERY CPLEVEL", query_buffer, 1024, NULL); + str = strchr(query_buffer, '\n'); + if (str) + *str = 0; + seq_printf(m, "VM: %s\n", query_buffer); + kfree(query_buffer); +} + +static struct service_level service_level_vm = { + .seq_print = service_level_vm_print +}; + +static __init int create_proc_service_level(void) +{ + proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops); + if (MACHINE_IS_VM) + register_service_level(&service_level_vm); + return 0; +} +subsys_initcall(create_proc_service_level); + +/* + * CPU capability might have changed. Therefore recalculate loops_per_jiffy. + */ +void s390_adjust_jiffies(void) +{ + struct sysinfo_1_2_2 *info; + unsigned long capability; + struct kernel_fpu fpu; + + info = (void *) get_zeroed_page(GFP_KERNEL); + if (!info) + return; + + if (stsi(info, 1, 2, 2) == 0) { + /* + * Major sigh. The cpu capability encoding is "special". + * If the first 9 bits of info->capability are 0 then it + * is a 32 bit unsigned integer in the range 0 .. 2^23. + * If the first 9 bits are != 0 then it is a 32 bit float. + * In addition a lower value indicates a proportionally + * higher cpu capacity. Bogomips are the other way round. + * To get to a halfway suitable number we divide 1e7 + * by the cpu capability number. Yes, that means a floating + * point division .. + */ + kernel_fpu_begin(&fpu, KERNEL_FPR); + asm volatile( + " sfpc %3\n" + " l %0,%1\n" + " tmlh %0,0xff80\n" + " jnz 0f\n" + " cefbr %%f2,%0\n" + " j 1f\n" + "0: le %%f2,%1\n" + "1: cefbr %%f0,%2\n" + " debr %%f0,%%f2\n" + " cgebr %0,5,%%f0\n" + : "=&d" (capability) + : "Q" (info->capability), "d" (10000000), "d" (0) + : "cc" + ); + kernel_fpu_end(&fpu, KERNEL_FPR); + } else + /* + * Really old machine without stsi block for basic + * cpu information. Report 42.0 bogomips. + */ + capability = 42; + loops_per_jiffy = capability * (500000/HZ); + free_page((unsigned long) info); +} + +/* + * calibrate the delay loop + */ +void calibrate_delay(void) +{ + s390_adjust_jiffies(); + /* Print the good old Bogomips line .. */ + printk(KERN_DEBUG "Calibrating delay loop (skipped)... " + "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); +} + +#ifdef CONFIG_DEBUG_FS + +#define STSI_FILE(fc, s1, s2) \ +static int stsi_open_##fc##_##s1##_##s2(struct inode *inode, struct file *file)\ +{ \ + file->private_data = (void *) get_zeroed_page(GFP_KERNEL); \ + if (!file->private_data) \ + return -ENOMEM; \ + if (stsi(file->private_data, fc, s1, s2)) { \ + free_page((unsigned long)file->private_data); \ + file->private_data = NULL; \ + return -EACCES; \ + } \ + return nonseekable_open(inode, file); \ +} \ + \ +static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ + .open = stsi_open_##fc##_##s1##_##s2, \ + .release = stsi_release, \ + .read = stsi_read, \ + .llseek = no_llseek, \ +}; + +static int stsi_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t stsi_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + return simple_read_from_buffer(buf, size, ppos, file->private_data, PAGE_SIZE); +} + +STSI_FILE( 1, 1, 1); +STSI_FILE( 1, 2, 1); +STSI_FILE( 1, 2, 2); +STSI_FILE( 2, 2, 1); +STSI_FILE( 2, 2, 2); +STSI_FILE( 3, 2, 2); +STSI_FILE(15, 1, 2); +STSI_FILE(15, 1, 3); +STSI_FILE(15, 1, 4); +STSI_FILE(15, 1, 5); +STSI_FILE(15, 1, 6); + +struct stsi_file { + const struct file_operations *fops; + char *name; +}; + +static struct stsi_file stsi_file[] __initdata = { + {.fops = &stsi_1_1_1_fs_ops, .name = "1_1_1"}, + {.fops = &stsi_1_2_1_fs_ops, .name = "1_2_1"}, + {.fops = &stsi_1_2_2_fs_ops, .name = "1_2_2"}, + {.fops = &stsi_2_2_1_fs_ops, .name = "2_2_1"}, + {.fops = &stsi_2_2_2_fs_ops, .name = "2_2_2"}, + {.fops = &stsi_3_2_2_fs_ops, .name = "3_2_2"}, + {.fops = &stsi_15_1_2_fs_ops, .name = "15_1_2"}, + {.fops = &stsi_15_1_3_fs_ops, .name = "15_1_3"}, + {.fops = &stsi_15_1_4_fs_ops, .name = "15_1_4"}, + {.fops = &stsi_15_1_5_fs_ops, .name = "15_1_5"}, + {.fops = &stsi_15_1_6_fs_ops, .name = "15_1_6"}, +}; + +static u8 stsi_0_0_0; + +static __init int stsi_init_debugfs(void) +{ + struct dentry *stsi_root; + struct stsi_file *sf; + int lvl, i; + + stsi_root = debugfs_create_dir("stsi", arch_debugfs_dir); + if (IS_ERR_OR_NULL(stsi_root)) + return 0; + lvl = stsi(NULL, 0, 0, 0); + if (lvl > 0) + stsi_0_0_0 = lvl; + debugfs_create_u8("0_0_0", 0400, stsi_root, &stsi_0_0_0); + for (i = 0; i < ARRAY_SIZE(stsi_file); i++) { + sf = &stsi_file[i]; + debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); + } + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + char link_to[10]; + + sprintf(link_to, "15_1_%d", topology_mnest_limit()); + debugfs_create_symlink("topology", stsi_root, link_to); + } + return 0; +} +device_initcall(stsi_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c new file mode 100644 index 000000000..11c32b228 --- /dev/null +++ b/arch/s390/kernel/time.c @@ -0,0 +1,923 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Time of day based timer functions. + * + * S390 version + * Copyright IBM Corp. 1999, 2008 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * + * Derived from "arch/i386/kernel/time.c" + * Copyright (C) 1991, 1992, 1995 Linus Torvalds + */ + +#define KMSG_COMPONENT "time" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/stop_machine.h> +#include <linux/time.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/types.h> +#include <linux/profile.h> +#include <linux/timex.h> +#include <linux/notifier.h> +#include <linux/timekeeper_internal.h> +#include <linux/clockchips.h> +#include <linux/gfp.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <asm/facility.h> +#include <asm/delay.h> +#include <asm/div64.h> +#include <asm/vdso.h> +#include <asm/irq.h> +#include <asm/irq_regs.h> +#include <asm/vtimer.h> +#include <asm/stp.h> +#include <asm/cio.h> +#include "entry.h" + +unsigned char tod_clock_base[16] __aligned(8) = { + /* Force to data section. */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); + +static DEFINE_PER_CPU(struct clock_event_device, comparators); + +ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); +EXPORT_SYMBOL(s390_epoch_delta_notifier); + +unsigned char ptff_function_mask[16]; + +static unsigned long long lpar_offset; +static unsigned long long initial_leap_seconds; +static unsigned long long tod_steering_end; +static long long tod_steering_delta; + +/* + * Get time offsets with PTFF + */ +void __init time_early_init(void) +{ + struct ptff_qto qto; + struct ptff_qui qui; + + /* Initialize TOD steering parameters */ + tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; + vdso_data->ts_end = tod_steering_end; + + if (!test_facility(28)) + return; + + ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF); + + /* get LPAR offset */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + + /* get initial leap seconds */ + if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) + initial_leap_seconds = (unsigned long long) + ((long) qui.old_leap * 4096000000L); +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long notrace sched_clock(void) +{ + return tod_to_ns(get_tod_clock_monotonic()); +} +NOKPROBE_SYMBOL(sched_clock); + +/* + * Monotonic_clock - returns # of nanoseconds passed since time_init() + */ +unsigned long long monotonic_clock(void) +{ + return sched_clock(); +} +EXPORT_SYMBOL(monotonic_clock); + +static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) +{ + unsigned long long high, low, rem, sec, nsec; + + /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ + high = (*(unsigned long long *) clk) >> 4; + low = (*(unsigned long long *)&clk[7]) << 4; + /* Calculate seconds and nano-seconds */ + sec = high; + rem = do_div(sec, 1000000); + nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; + + xt->tv_sec = sec; + xt->tv_nsec = nsec; +} + +void clock_comparator_work(void) +{ + struct clock_event_device *cd; + + S390_lowcore.clock_comparator = clock_comparator_max; + cd = this_cpu_ptr(&comparators); + cd->event_handler(cd); +} + +static int s390_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + S390_lowcore.clock_comparator = get_tod_clock() + delta; + set_clock_comparator(S390_lowcore.clock_comparator); + return 0; +} + +/* + * Set up lowcore and control register of the current cpu to + * enable TOD clock and clock comparator interrupts. + */ +void init_cpu_timer(void) +{ + struct clock_event_device *cd; + int cpu; + + S390_lowcore.clock_comparator = clock_comparator_max; + set_clock_comparator(S390_lowcore.clock_comparator); + + cpu = smp_processor_id(); + cd = &per_cpu(comparators, cpu); + cd->name = "comparator"; + cd->features = CLOCK_EVT_FEAT_ONESHOT; + cd->mult = 16777; + cd->shift = 12; + cd->min_delta_ns = 1; + cd->min_delta_ticks = 1; + cd->max_delta_ns = LONG_MAX; + cd->max_delta_ticks = ULONG_MAX; + cd->rating = 400; + cd->cpumask = cpumask_of(cpu); + cd->set_next_event = s390_next_event; + + clockevents_register_device(cd); + + /* Enable clock comparator timer interrupt. */ + __ctl_set_bit(0,11); + + /* Always allow the timing alert external interrupt. */ + __ctl_set_bit(0, 4); +} + +static void clock_comparator_interrupt(struct ext_code ext_code, + unsigned int param32, + unsigned long param64) +{ + inc_irq_stat(IRQEXT_CLK); + if (S390_lowcore.clock_comparator == clock_comparator_max) + set_clock_comparator(S390_lowcore.clock_comparator); +} + +static void stp_timing_alert(struct stp_irq_parm *); + +static void timing_alert_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + inc_irq_stat(IRQEXT_TLA); + if (param32 & 0x00038000) + stp_timing_alert((struct stp_irq_parm *) ¶m32); +} + +static void stp_reset(void); + +void read_persistent_clock64(struct timespec64 *ts) +{ + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; + + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + get_tod_clock_ext(clk); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); +} + +void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) +{ + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + struct timespec64 boot_time; + __u64 delta; + + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE); + *(__u64 *)&clk[1] -= delta; + if (*(__u64 *)&clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, &boot_time); + + read_persistent_clock64(wall_time); + *boot_offset = timespec64_sub(*wall_time, boot_time); +} + +static u64 read_tod_clock(struct clocksource *cs) +{ + unsigned long long now, adj; + + preempt_disable(); /* protect from changes to steering parameters */ + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* + * manually steer by 1 cycle every 2^16 cycles. This + * corresponds to shifting the tod delta by 15. 1s is + * therefore steered in ~9h. The adjust will decrease + * over time, until it finally reaches 0. + */ + now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + preempt_enable(); + return now; +} + +static struct clocksource clocksource_tod = { + .name = "tod", + .rating = 400, + .read = read_tod_clock, + .mask = -1ULL, + .mult = 1000, + .shift = 12, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +struct clocksource * __init clocksource_default_clock(void) +{ + return &clocksource_tod; +} + +void update_vsyscall(struct timekeeper *tk) +{ + u64 nsecps; + + if (tk->tkr_mono.clock != &clocksource_tod) + return; + + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; + vdso_data->xtime_clock_sec = tk->xtime_sec; + vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; + vdso_data->wtom_clock_sec = + tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + + + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; + while (vdso_data->wtom_clock_nsec >= nsecps) { + vdso_data->wtom_clock_nsec -= nsecps; + vdso_data->wtom_clock_sec++; + } + + vdso_data->xtime_coarse_sec = tk->xtime_sec; + vdso_data->xtime_coarse_nsec = + (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + vdso_data->wtom_coarse_sec = + vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_coarse_nsec = + vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) { + vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC; + vdso_data->wtom_coarse_sec++; + } + + vdso_data->tk_mult = tk->tkr_mono.mult; + vdso_data->tk_shift = tk->tkr_mono.shift; + vdso_data->hrtimer_res = hrtimer_resolution; + smp_wmb(); + ++vdso_data->tb_update_count; +} + +extern struct timezone sys_tz; + +void update_vsyscall_tz(void) +{ + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; +} + +/* + * Initialize the TOD clock and the CPU timer of + * the boot cpu. + */ +void __init time_init(void) +{ + /* Reset time synchronization interfaces. */ + stp_reset(); + + /* request the clock comparator external interrupt */ + if (register_external_irq(EXT_IRQ_CLK_COMP, clock_comparator_interrupt)) + panic("Couldn't request external interrupt 0x1004"); + + /* request the timing alert external interrupt */ + if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt)) + panic("Couldn't request external interrupt 0x1406"); + + if (__clocksource_register(&clocksource_tod) != 0) + panic("Could not register TOD clock source"); + + /* Enable TOD clock interrupts on the boot cpu. */ + init_cpu_timer(); + + /* Enable cpu timer interrupts on the boot cpu. */ + vtime_init(); +} + +static DEFINE_PER_CPU(atomic_t, clock_sync_word); +static DEFINE_MUTEX(clock_sync_mutex); +static unsigned long clock_sync_flags; + +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_STPINFO_VALID 2 + +/* + * The get_clock function for the physical clock. It will get the current + * TOD clock, subtract the LPAR offset and write the result to *clock. + * The function returns 0 if the clock is in sync with the external time + * source. If the clock mode is local it will return -EOPNOTSUPP and + * -EAGAIN if the clock is not in sync with the external reference. + */ +int get_phys_clock(unsigned long *clock) +{ + atomic_t *sw_ptr; + unsigned int sw0, sw1; + + sw_ptr = &get_cpu_var(clock_sync_word); + sw0 = atomic_read(sw_ptr); + *clock = get_tod_clock() - lpar_offset; + sw1 = atomic_read(sw_ptr); + put_cpu_var(clock_sync_word); + if (sw0 == sw1 && (sw0 & 0x80000000U)) + /* Success: time is in sync. */ + return 0; + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return -EOPNOTSUPP; + if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) + return -EACCES; + return -EAGAIN; +} +EXPORT_SYMBOL(get_phys_clock); + +/* + * Make get_phys_clock() return -EAGAIN. + */ +static void disable_sync_clock(void *dummy) +{ + atomic_t *sw_ptr = this_cpu_ptr(&clock_sync_word); + /* + * Clear the in-sync bit 2^31. All get_phys_clock calls will + * fail until the sync bit is turned back on. In addition + * increase the "sequence" counter to avoid the race of an + * stp event and the complete recovery against get_phys_clock. + */ + atomic_andnot(0x80000000, sw_ptr); + atomic_inc(sw_ptr); +} + +/* + * Make get_phys_clock() return 0 again. + * Needs to be called from a context disabled for preemption. + */ +static void enable_sync_clock(void) +{ + atomic_t *sw_ptr = this_cpu_ptr(&clock_sync_word); + atomic_or(0x80000000, sw_ptr); +} + +/* + * Function to check if the clock is in sync. + */ +static inline int check_sync_clock(void) +{ + atomic_t *sw_ptr; + int rc; + + sw_ptr = &get_cpu_var(clock_sync_word); + rc = (atomic_read(sw_ptr) & 0x80000000U) != 0; + put_cpu_var(clock_sync_word); + return rc; +} + +/* + * Apply clock delta to the global data structures. + * This is called once on the CPU that performed the clock sync. + */ +static void clock_sync_global(unsigned long long delta) +{ + unsigned long now, adj; + struct ptff_qto qto; + + /* Fixup the monotonic sched clock. */ + *(unsigned long long *) &tod_clock_base[1] += delta; + if (*(unsigned long long *) &tod_clock_base[1] < delta) + /* Epoch overflow */ + tod_clock_base[0]++; + /* Adjust TOD steering parameters. */ + vdso_data->tb_update_count++; + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* Calculate how much of the old adjustment is left. */ + tod_steering_delta = (tod_steering_delta < 0) ? + -(adj >> 15) : (adj >> 15); + tod_steering_delta += delta; + if ((abs(tod_steering_delta) >> 48) != 0) + panic("TOD clock sync offset %lli is too large to drift\n", + tod_steering_delta); + tod_steering_end = now + (abs(tod_steering_delta) << 15); + vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; + vdso_data->ts_end = tod_steering_end; + vdso_data->tb_update_count++; + /* Update LPAR offset. */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + /* Call the TOD clock change notifier. */ + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &delta); +} + +/* + * Apply clock delta to the per-CPU data structures of this CPU. + * This is called for each online CPU after the call to clock_sync_global. + */ +static void clock_sync_local(unsigned long long delta) +{ + /* Add the delta to the clock comparator. */ + if (S390_lowcore.clock_comparator != clock_comparator_max) { + S390_lowcore.clock_comparator += delta; + set_clock_comparator(S390_lowcore.clock_comparator); + } + /* Adjust the last_update_clock time-stamp. */ + S390_lowcore.last_update_clock += delta; +} + +/* Single threaded workqueue used for stp sync events */ +static struct workqueue_struct *time_sync_wq; + +static void __init time_init_wq(void) +{ + if (time_sync_wq) + return; + time_sync_wq = create_singlethread_workqueue("timesync"); +} + +struct clock_sync_data { + atomic_t cpus; + int in_sync; + unsigned long long clock_delta; +}; + +/* + * Server Time Protocol (STP) code. + */ +static bool stp_online; +static struct stp_sstpi stp_info; +static void *stp_page; + +static void stp_work_fn(struct work_struct *work); +static DEFINE_MUTEX(stp_work_mutex); +static DECLARE_WORK(stp_work, stp_work_fn); +static struct timer_list stp_timer; + +static int __init early_parse_stp(char *p) +{ + return kstrtobool(p, &stp_online); +} +early_param("stp", early_parse_stp); + +/* + * Reset STP attachment. + */ +static void __init stp_reset(void) +{ + int rc; + + stp_page = (void *) get_zeroed_page(GFP_ATOMIC); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); + if (rc == 0) + set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); + else if (stp_online) { + pr_warn("The real or virtual hardware system does not provide an STP interface\n"); + free_page((unsigned long) stp_page); + stp_page = NULL; + stp_online = false; + } +} + +static void stp_timeout(struct timer_list *unused) +{ + queue_work(time_sync_wq, &stp_work); +} + +static int __init stp_init(void) +{ + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return 0; + timer_setup(&stp_timer, stp_timeout, 0); + time_init_wq(); + if (!stp_online) + return 0; + queue_work(time_sync_wq, &stp_work); + return 0; +} + +arch_initcall(stp_init); + +/* + * STP timing alert. There are three causes: + * 1) timing status change + * 2) link availability change + * 3) time control parameter change + * In all three cases we are only interested in the clock source state. + * If a STP clock source is now available use it. + */ +static void stp_timing_alert(struct stp_irq_parm *intparm) +{ + if (intparm->tsc || intparm->lac || intparm->tcpc) + queue_work(time_sync_wq, &stp_work); +} + +/* + * STP sync check machine check. This is called when the timing state + * changes from the synchronized state to the unsynchronized state. + * After a STP sync check the clock is not in sync. The machine check + * is broadcasted to all cpus at the same time. + */ +int stp_sync_check(void) +{ + disable_sync_clock(NULL); + return 1; +} + +/* + * STP island condition machine check. This is called when an attached + * server attempts to communicate over an STP link and the servers + * have matching CTN ids and have a valid stratum-1 configuration + * but the configurations do not match. + */ +int stp_island_check(void) +{ + disable_sync_clock(NULL); + return 1; +} + +void stp_queue_work(void) +{ + queue_work(time_sync_wq, &stp_work); +} + +static int __store_stpinfo(void) +{ + int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + + if (rc) + clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + else + set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + return rc; +} + +static int stpinfo_valid(void) +{ + return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); +} + +static int stp_sync_clock(void *data) +{ + struct clock_sync_data *sync = data; + unsigned long long clock_delta; + static int first; + int rc; + + enable_sync_clock(); + if (xchg(&first, 1) == 0) { + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&sync->cpus) != 0) + cpu_relax(); + rc = 0; + if (stp_info.todoff[0] || stp_info.todoff[1] || + stp_info.todoff[2] || stp_info.todoff[3] || + stp_info.tmd != 2) { + rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, + &clock_delta); + if (rc == 0) { + sync->clock_delta = clock_delta; + clock_sync_global(clock_delta); + rc = __store_stpinfo(); + if (rc == 0 && stp_info.tmd != 2) + rc = -EAGAIN; + } + } + sync->in_sync = rc ? -EAGAIN : 1; + xchg(&first, 0); + } else { + /* Slave */ + atomic_dec(&sync->cpus); + /* Wait for in_sync to be set. */ + while (READ_ONCE(sync->in_sync) == 0) + __udelay(1); + } + if (sync->in_sync != 1) + /* Didn't work. Clear per-cpu in sync bit again. */ + disable_sync_clock(NULL); + /* Apply clock delta to per-CPU fields of this CPU. */ + clock_sync_local(sync->clock_delta); + + return 0; +} + +/* + * STP work. Check for the STP state and take over the clock + * synchronization if the STP clock source is usable. + */ +static void stp_work_fn(struct work_struct *work) +{ + struct clock_sync_data stp_sync; + int rc; + + /* prevent multiple execution. */ + mutex_lock(&stp_work_mutex); + + if (!stp_online) { + chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); + del_timer_sync(&stp_timer); + goto out_unlock; + } + + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL); + if (rc) + goto out_unlock; + + rc = __store_stpinfo(); + if (rc || stp_info.c == 0) + goto out_unlock; + + /* Skip synchronization if the clock is already in sync. */ + if (check_sync_clock()) + goto out_unlock; + + memset(&stp_sync, 0, sizeof(stp_sync)); + cpus_read_lock(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); + + if (!check_sync_clock()) + /* + * There is a usable clock but the synchonization failed. + * Retry after a second. + */ + mod_timer(&stp_timer, jiffies + HZ); + +out_unlock: + mutex_unlock(&stp_work_mutex); +} + +/* + * STP subsys sysfs interface functions + */ +static struct bus_type stp_subsys = { + .name = "stp", + .dev_name = "stp", +}; + +static ssize_t stp_ctn_id_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%016llx\n", + *(unsigned long long *) stp_info.ctnid); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL); + +static ssize_t stp_ctn_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.ctn); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL); + +static ssize_t stp_dst_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x2000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL); + +static ssize_t stp_leap_seconds_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x8000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL); + +static ssize_t stp_stratum_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL); + +static ssize_t stp_time_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x0800)) + ret = sprintf(buf, "%i\n", (int) stp_info.tto); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL); + +static ssize_t stp_time_zone_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x4000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(time_zone_offset, 0400, + stp_time_zone_offset_show, NULL); + +static ssize_t stp_timing_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tmd); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL); + +static ssize_t stp_timing_state_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_work_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tst); + mutex_unlock(&stp_work_mutex); + return ret; +} + +static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL); + +static ssize_t stp_online_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%i\n", stp_online); +} + +static ssize_t stp_online_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int value; + + value = simple_strtoul(buf, NULL, 0); + if (value != 0 && value != 1) + return -EINVAL; + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return -EOPNOTSUPP; + mutex_lock(&clock_sync_mutex); + stp_online = value; + if (stp_online) + set_bit(CLOCK_SYNC_STP, &clock_sync_flags); + else + clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); + queue_work(time_sync_wq, &stp_work); + mutex_unlock(&clock_sync_mutex); + return count; +} + +/* + * Can't use DEVICE_ATTR because the attribute should be named + * stp/online but dev_attr_online already exists in this file .. + */ +static struct device_attribute dev_attr_stp_online = { + .attr = { .name = "online", .mode = 0600 }, + .show = stp_online_show, + .store = stp_online_store, +}; + +static struct device_attribute *stp_attributes[] = { + &dev_attr_ctn_id, + &dev_attr_ctn_type, + &dev_attr_dst_offset, + &dev_attr_leap_seconds, + &dev_attr_stp_online, + &dev_attr_stratum, + &dev_attr_time_offset, + &dev_attr_time_zone_offset, + &dev_attr_timing_mode, + &dev_attr_timing_state, + NULL +}; + +static int __init stp_init_sysfs(void) +{ + struct device_attribute **attr; + int rc; + + rc = subsys_system_register(&stp_subsys, NULL); + if (rc) + goto out; + for (attr = stp_attributes; *attr; attr++) { + rc = device_create_file(stp_subsys.dev_root, *attr); + if (rc) + goto out_unreg; + } + return 0; +out_unreg: + for (; attr >= stp_attributes; attr--) + device_remove_file(stp_subsys.dev_root, *attr); + bus_unregister(&stp_subsys); +out: + return rc; +} + +device_initcall(stp_init_sysfs); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c new file mode 100644 index 000000000..7b9688897 --- /dev/null +++ b/arch/s390/kernel/topology.c @@ -0,0 +1,641 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2011 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/workqueue.h> +#include <linux/bootmem.h> +#include <linux/uaccess.h> +#include <linux/sysctl.h> +#include <linux/cpuset.h> +#include <linux/device.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/topology.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/mm.h> +#include <linux/nodemask.h> +#include <linux/node.h> +#include <asm/sysinfo.h> +#include <asm/numa.h> + +#define PTF_HORIZONTAL (0UL) +#define PTF_VERTICAL (1UL) +#define PTF_CHECK (2UL) + +enum { + TOPOLOGY_MODE_HW, + TOPOLOGY_MODE_SINGLE, + TOPOLOGY_MODE_PACKAGE, + TOPOLOGY_MODE_UNINITIALIZED +}; + +struct mask_info { + struct mask_info *next; + unsigned char id; + cpumask_t mask; +}; + +static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; +static void set_topology_timer(void); +static void topology_work_fn(struct work_struct *work); +static struct sysinfo_15_1_x *tl_info; + +static DECLARE_WORK(topology_work, topology_work_fn); + +/* + * Socket/Book linked lists and cpu_topology updates are + * protected by "sched_domains_mutex". + */ +static struct mask_info socket_info; +static struct mask_info book_info; +static struct mask_info drawer_info; + +struct cpu_topology_s390 cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +cpumask_t cpus_with_topology; + +static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) +{ + cpumask_t mask; + + cpumask_copy(&mask, cpumask_of(cpu)); + switch (topology_mode) { + case TOPOLOGY_MODE_HW: + while (info) { + if (cpumask_test_cpu(cpu, &info->mask)) { + mask = info->mask; + break; + } + info = info->next; + } + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpumask_of(cpu)); + break; + case TOPOLOGY_MODE_PACKAGE: + cpumask_copy(&mask, cpu_present_mask); + break; + default: + /* fallthrough */ + case TOPOLOGY_MODE_SINGLE: + cpumask_copy(&mask, cpumask_of(cpu)); + break; + } + return mask; +} + +static cpumask_t cpu_thread_map(unsigned int cpu) +{ + cpumask_t mask; + int i; + + cpumask_copy(&mask, cpumask_of(cpu)); + if (topology_mode != TOPOLOGY_MODE_HW) + return mask; + cpu -= cpu % (smp_cpu_mtid + 1); + for (i = 0; i <= smp_cpu_mtid; i++) + if (cpu_present(cpu + i)) + cpumask_set_cpu(cpu + i, &mask); + return mask; +} + +#define TOPOLOGY_CORE_BITS 64 + +static void add_cpus_to_mask(struct topology_core *tl_core, + struct mask_info *drawer, + struct mask_info *book, + struct mask_info *socket) +{ + struct cpu_topology_s390 *topo; + unsigned int core; + + for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { + unsigned int rcore; + int lcpu, i; + + rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; + lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (lcpu < 0) + continue; + for (i = 0; i <= smp_cpu_mtid; i++) { + topo = &cpu_topology[lcpu + i]; + topo->drawer_id = drawer->id; + topo->book_id = book->id; + topo->socket_id = socket->id; + topo->core_id = rcore; + topo->thread_id = lcpu + i; + topo->dedicated = tl_core->d; + cpumask_set_cpu(lcpu + i, &drawer->mask); + cpumask_set_cpu(lcpu + i, &book->mask); + cpumask_set_cpu(lcpu + i, &socket->mask); + cpumask_set_cpu(lcpu + i, &cpus_with_topology); + smp_cpu_set_polarization(lcpu + i, tl_core->pp); + } + } +} + +static void clear_masks(void) +{ + struct mask_info *info; + + info = &socket_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } + info = &book_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } + info = &drawer_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } +} + +static union topology_entry *next_tle(union topology_entry *tle) +{ + if (!tle->nl) + return (union topology_entry *)((struct topology_core *)tle + 1); + return (union topology_entry *)((struct topology_container *)tle + 1); +} + +static void tl_to_masks(struct sysinfo_15_1_x *info) +{ + struct mask_info *socket = &socket_info; + struct mask_info *book = &book_info; + struct mask_info *drawer = &drawer_info; + union topology_entry *tle, *end; + + clear_masks(); + tle = info->tle; + end = (union topology_entry *)((unsigned long)info + info->length); + while (tle < end) { + switch (tle->nl) { + case 3: + drawer = drawer->next; + drawer->id = tle->container.id; + break; + case 2: + book = book->next; + book->id = tle->container.id; + break; + case 1: + socket = socket->next; + socket->id = tle->container.id; + break; + case 0: + add_cpus_to_mask(&tle->cpu, drawer, book, socket); + break; + default: + clear_masks(); + return; + } + tle = next_tle(tle); + } +} + +static void topology_update_polarization_simple(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + smp_cpu_set_polarization(cpu, POLARIZATION_HRZ); +} + +static int ptf(unsigned long fc) +{ + int rc; + + asm volatile( + " .insn rre,0xb9a20000,%1,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (rc) + : "d" (fc) : "cc"); + return rc; +} + +int topology_set_cpu_management(int fc) +{ + int cpu, rc; + + if (!MACHINE_HAS_TOPOLOGY) + return -EOPNOTSUPP; + if (fc) + rc = ptf(PTF_VERTICAL); + else + rc = ptf(PTF_HORIZONTAL); + if (rc) + return -EBUSY; + for_each_possible_cpu(cpu) + smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + return rc; +} + +static void update_cpu_masks(void) +{ + struct cpu_topology_s390 *topo; + int cpu, id; + + for_each_possible_cpu(cpu) { + topo = &cpu_topology[cpu]; + topo->thread_mask = cpu_thread_map(cpu); + topo->core_mask = cpu_group_map(&socket_info, cpu); + topo->book_mask = cpu_group_map(&book_info, cpu); + topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + if (topology_mode != TOPOLOGY_MODE_HW) { + id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu; + topo->thread_id = cpu; + topo->core_id = cpu; + topo->socket_id = id; + topo->book_id = id; + topo->drawer_id = id; + if (cpu_present(cpu)) + cpumask_set_cpu(cpu, &cpus_with_topology); + } + } + numa_update_cpu_topology(); +} + +void store_topology(struct sysinfo_15_1_x *info) +{ + stsi(info, 15, 1, topology_mnest_limit()); +} + +static void __arch_update_dedicated_flag(void *arg) +{ + if (topology_cpu_dedicated(smp_processor_id())) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); +} + +static int __arch_update_cpu_topology(void) +{ + struct sysinfo_15_1_x *info = tl_info; + int rc = 0; + + mutex_lock(&smp_cpu_state_mutex); + cpumask_clear(&cpus_with_topology); + if (MACHINE_HAS_TOPOLOGY) { + rc = 1; + store_topology(info); + tl_to_masks(info); + } + update_cpu_masks(); + if (!MACHINE_HAS_TOPOLOGY) + topology_update_polarization_simple(); + mutex_unlock(&smp_cpu_state_mutex); + return rc; +} + +int arch_update_cpu_topology(void) +{ + struct device *dev; + int cpu, rc; + + rc = __arch_update_cpu_topology(); + on_each_cpu(__arch_update_dedicated_flag, NULL, 0); + for_each_online_cpu(cpu) { + dev = get_cpu_device(cpu); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); + } + return rc; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} + +void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_flush_work(void) +{ + flush_work(&topology_work); +} + +static void topology_timer_fn(struct timer_list *unused) +{ + if (ptf(PTF_CHECK)) + topology_schedule_update(); + set_topology_timer(); +} + +static struct timer_list topology_timer; + +static atomic_t topology_poll = ATOMIC_INIT(0); + +static void set_topology_timer(void) +{ + if (atomic_add_unless(&topology_poll, -1, 0)) + mod_timer(&topology_timer, jiffies + HZ / 10); + else + mod_timer(&topology_timer, jiffies + HZ * 60); +} + +void topology_expect_change(void) +{ + if (!MACHINE_HAS_TOPOLOGY) + return; + /* This is racy, but it doesn't matter since it is just a heuristic. + * Worst case is that we poll in a higher frequency for a bit longer. + */ + if (atomic_read(&topology_poll) > 60) + return; + atomic_add(60, &topology_poll); + set_topology_timer(); +} + +static int cpu_management; + +static ssize_t dispatching_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", cpu_management); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} + +static ssize_t dispatching_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int val, rc; + char delim; + + if (sscanf(buf, "%d %c", &val, &delim) != 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + rc = 0; + get_online_cpus(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == val) + goto out; + rc = topology_set_cpu_management(val); + if (rc) + goto out; + cpu_management = val; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + put_online_cpus(); + return rc ? rc : count; +} +static DEVICE_ATTR_RW(dispatching); + +static ssize_t cpu_polarization_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cpu = dev->id; + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + switch (smp_cpu_get_polarization(cpu)) { + case POLARIZATION_HRZ: + count = sprintf(buf, "horizontal\n"); + break; + case POLARIZATION_VL: + count = sprintf(buf, "vertical:low\n"); + break; + case POLARIZATION_VM: + count = sprintf(buf, "vertical:medium\n"); + break; + case POLARIZATION_VH: + count = sprintf(buf, "vertical:high\n"); + break; + default: + count = sprintf(buf, "unknown\n"); + break; + } + mutex_unlock(&smp_cpu_state_mutex); + return count; +} +static DEVICE_ATTR(polarization, 0444, cpu_polarization_show, NULL); + +static struct attribute *topology_cpu_attrs[] = { + &dev_attr_polarization.attr, + NULL, +}; + +static struct attribute_group topology_cpu_attr_group = { + .attrs = topology_cpu_attrs, +}; + +static ssize_t cpu_dedicated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cpu = dev->id; + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} +static DEVICE_ATTR(dedicated, 0444, cpu_dedicated_show, NULL); + +static struct attribute *topology_extra_cpu_attrs[] = { + &dev_attr_dedicated.attr, + NULL, +}; + +static struct attribute_group topology_extra_cpu_attr_group = { + .attrs = topology_extra_cpu_attrs, +}; + +int topology_cpu_init(struct cpu *cpu) +{ + int rc; + + rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); + if (rc || !MACHINE_HAS_TOPOLOGY) + return rc; + rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); + if (rc) + sysfs_remove_group(&cpu->dev.kobj, &topology_cpu_attr_group); + return rc; +} + +static const struct cpumask *cpu_thread_mask(int cpu) +{ + return &cpu_topology[cpu].thread_mask; +} + + +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static const struct cpumask *cpu_drawer_mask(int cpu) +{ + return &cpu_topology[cpu].drawer_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void __init alloc_masks(struct sysinfo_15_1_x *info, + struct mask_info *mask, int offset) +{ + int i, nr_masks; + + nr_masks = info->mag[TOPOLOGY_NR_MAG - offset]; + for (i = 0; i < info->mnest - offset; i++) + nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; + nr_masks = max(nr_masks, 1); + for (i = 0; i < nr_masks; i++) { + mask->next = memblock_virt_alloc(sizeof(*mask->next), 8); + mask = mask->next; + } +} + +void __init topology_init_early(void) +{ + struct sysinfo_15_1_x *info; + + set_sched_topology(s390_topology); + if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { + if (MACHINE_HAS_TOPOLOGY) + topology_mode = TOPOLOGY_MODE_HW; + else + topology_mode = TOPOLOGY_MODE_SINGLE; + } + if (!MACHINE_HAS_TOPOLOGY) + goto out; + tl_info = memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE); + info = tl_info; + store_topology(info); + pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", + info->mag[0], info->mag[1], info->mag[2], info->mag[3], + info->mag[4], info->mag[5], info->mnest); + alloc_masks(info, &socket_info, 1); + alloc_masks(info, &book_info, 2); + alloc_masks(info, &drawer_info, 3); +out: + __arch_update_cpu_topology(); + __arch_update_dedicated_flag(NULL); +} + +static inline int topology_get_mode(int enabled) +{ + if (!enabled) + return TOPOLOGY_MODE_SINGLE; + return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; +} + +static inline int topology_is_enabled(void) +{ + return topology_mode != TOPOLOGY_MODE_SINGLE; +} + +static int __init topology_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + topology_mode = topology_get_mode(enabled); + return 0; +} +early_param("topology", topology_setup); + +static int topology_ctl_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int enabled = topology_is_enabled(); + int new_mode; + int zero = 0; + int one = 1; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &enabled, + .maxlen = sizeof(int), + .extra1 = &zero, + .extra2 = &one, + }; + + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + mutex_lock(&smp_cpu_state_mutex); + new_mode = topology_get_mode(enabled); + if (topology_mode != new_mode) { + topology_mode = new_mode; + topology_schedule_update(); + } + mutex_unlock(&smp_cpu_state_mutex); + topology_flush_work(); + + return rc; +} + +static struct ctl_table topology_ctl_table[] = { + { + .procname = "topology", + .mode = 0644, + .proc_handler = topology_ctl_handler, + }, + { }, +}; + +static struct ctl_table topology_dir_table[] = { + { + .procname = "s390", + .maxlen = 0, + .mode = 0555, + .child = topology_ctl_table, + }, + { }, +}; + +static int __init topology_init(void) +{ + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); + if (MACHINE_HAS_TOPOLOGY) + set_topology_timer(); + else + topology_update_polarization_simple(); + register_sysctl_table(topology_dir_table); + return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); +} +device_initcall(topology_init); diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c new file mode 100644 index 000000000..11a669f3c --- /dev/null +++ b/arch/s390/kernel/trace.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tracepoint definitions for s390 + * + * Copyright IBM Corp. 2015 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/percpu.h> +#define CREATE_TRACE_POINTS +#include <asm/trace/diag.h> + +EXPORT_TRACEPOINT_SYMBOL(s390_diagnose); + +static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth); + +void notrace trace_s390_diagnose_norecursion(int diag_nr) +{ + unsigned long flags; + unsigned int *depth; + + /* Avoid lockdep recursion. */ + if (IS_ENABLED(CONFIG_LOCKDEP)) + return; + local_irq_save(flags); + depth = this_cpu_ptr(&diagnose_trace_depth); + if (*depth == 0) { + (*depth)++; + trace_s390_diagnose(diag_nr); + (*depth)--; + } + local_irq_restore(flags); +} diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c new file mode 100644 index 000000000..8003b38c1 --- /dev/null +++ b/arch/s390/kernel/traps.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * + * Derived from "arch/i386/kernel/traps.c" + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. + */ +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/extable.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/cpu.h> +#include <asm/fpu/api.h> +#include "entry.h" + +static inline void __user *get_trap_ip(struct pt_regs *regs) +{ + unsigned long address; + + if (regs->int_code & 0x200) + address = *(unsigned long *)(current->thread.trap_tdb + 24); + else + address = regs->psw.addr; + return (void __user *) (address - (regs->int_code >> 16)); +} + +int is_valid_bugaddr(unsigned long addr) +{ + return 1; +} + +void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) +{ + if (user_mode(regs)) { + force_sig_fault(si_signo, si_code, get_trap_ip(regs), current); + report_user_fault(regs, si_signo, 0); + } else { + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->psw.addr); + if (fixup) + regs->psw.addr = extable_fixup(fixup); + else { + enum bug_trap_type btt; + + btt = report_bug(regs->psw.addr, regs); + if (btt == BUG_TRAP_TYPE_WARN) + return; + die(regs, str); + } + } +} + +static void do_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) +{ + if (notify_die(DIE_TRAP, str, regs, 0, + regs->int_code, si_signo) == NOTIFY_STOP) + return; + do_report_trap(regs, si_signo, si_code, str); +} +NOKPROBE_SYMBOL(do_trap); + +void do_per_trap(struct pt_regs *regs) +{ + if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP) + return; + if (!current->ptrace) + return; + force_sig_fault(SIGTRAP, TRAP_HWBKPT, + (void __force __user *) current->thread.per_event.address, current); +} +NOKPROBE_SYMBOL(do_per_trap); + +void default_trap_handler(struct pt_regs *regs) +{ + if (user_mode(regs)) { + report_user_fault(regs, SIGSEGV, 0); + do_exit(SIGSEGV); + } else + die(regs, "Unknown program exception"); +} + +#define DO_ERROR_INFO(name, signr, sicode, str) \ +void name(struct pt_regs *regs) \ +{ \ + do_trap(regs, signr, sicode, str); \ +} + +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, + "addressing exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, + "execute exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, + "fixpoint divide exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, + "fixpoint overflow exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, + "HFP overflow exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, + "HFP underflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, + "HFP significance exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, + "HFP divide exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, + "HFP square root exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, + "operand exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, + "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, + "special operation exception") +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, + "transaction constraint exception") + +static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) +{ + int si_code = 0; + /* FPC[2] is Data Exception Code */ + if ((fpc & 0x00000300) == 0) { + /* bits 6 and 7 of DXC are 0 iff IEEE exception */ + if (fpc & 0x8000) /* invalid fp operation */ + si_code = FPE_FLTINV; + else if (fpc & 0x4000) /* div by 0 */ + si_code = FPE_FLTDIV; + else if (fpc & 0x2000) /* overflow */ + si_code = FPE_FLTOVF; + else if (fpc & 0x1000) /* underflow */ + si_code = FPE_FLTUND; + else if (fpc & 0x0800) /* inexact */ + si_code = FPE_FLTRES; + } + do_trap(regs, SIGFPE, si_code, "floating point exception"); +} + +void translation_exception(struct pt_regs *regs) +{ + /* May never happen. */ + panic("Translation exception"); +} + +void illegal_op(struct pt_regs *regs) +{ + __u8 opcode[6]; + __u16 __user *location; + int is_uprobe_insn = 0; + int signal = 0; + + location = get_trap_ip(regs); + + if (user_mode(regs)) { + if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) + return; + if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { + if (current->ptrace) + force_sig_fault(SIGTRAP, TRAP_BRKPT, location, current); + else + signal = SIGILL; +#ifdef CONFIG_UPROBES + } else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) { + is_uprobe_insn = 1; +#endif + } else + signal = SIGILL; + } + /* + * We got either an illegal op in kernel mode, or user space trapped + * on a uprobes illegal instruction. See if kprobes or uprobes picks + * it up. If not, SIGILL. + */ + if (is_uprobe_insn || !user_mode(regs)) { + if (notify_die(DIE_BPT, "bpt", regs, 0, + 3, SIGTRAP) != NOTIFY_STOP) + signal = SIGILL; + } + if (signal) + do_trap(regs, signal, ILL_ILLOPC, "illegal operation"); +} +NOKPROBE_SYMBOL(illegal_op); + +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, + "specification exception"); + +void vector_exception(struct pt_regs *regs) +{ + int si_code, vic; + + if (!MACHINE_HAS_VX) { + do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); + return; + } + + /* get vector interrupt code from fpc */ + save_fpu_regs(); + vic = (current->thread.fpu.fpc & 0xf00) >> 8; + switch (vic) { + case 1: /* invalid vector operation */ + si_code = FPE_FLTINV; + break; + case 2: /* division by zero */ + si_code = FPE_FLTDIV; + break; + case 3: /* overflow */ + si_code = FPE_FLTOVF; + break; + case 4: /* underflow */ + si_code = FPE_FLTUND; + break; + case 5: /* inexact */ + si_code = FPE_FLTRES; + break; + default: /* unknown cause */ + si_code = 0; + } + do_trap(regs, SIGFPE, si_code, "vector exception"); +} + +void data_exception(struct pt_regs *regs) +{ + int signal = 0; + + save_fpu_regs(); + if (current->thread.fpu.fpc & FPC_DXC_MASK) + signal = SIGFPE; + else + signal = SIGILL; + if (signal == SIGFPE) + do_fp_trap(regs, current->thread.fpu.fpc); + else if (signal) + do_trap(regs, signal, ILL_ILLOPN, "data exception"); +} + +void space_switch_exception(struct pt_regs *regs) +{ + /* Set user psw back to home space mode. */ + if (user_mode(regs)) + regs->psw.mask |= PSW_ASC_HOME; + /* Send SIGILL. */ + do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); +} + +void kernel_stack_overflow(struct pt_regs *regs) +{ + bust_spinlocks(1); + printk("Kernel stack overflow.\n"); + show_regs(regs); + bust_spinlocks(0); + panic("Corrupt kernel stack, can't continue."); +} +NOKPROBE_SYMBOL(kernel_stack_overflow); + +void __init trap_init(void) +{ + local_mcck_enable(); +} diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c new file mode 100644 index 000000000..5007fac01 --- /dev/null +++ b/arch/s390/kernel/uprobes.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * User-space Probes (UProbes) for s390 + * + * Copyright IBM Corp. 2014 + * Author(s): Jan Willeke, + */ + +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/compat.h> +#include <linux/kdebug.h> +#include <linux/sched/task_stack.h> + +#include <asm/switch_to.h> +#include <asm/facility.h> +#include <asm/kprobes.h> +#include <asm/dis.h> +#include "entry.h" + +#define UPROBE_TRAP_NR UINT_MAX + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + return probe_is_prohibited_opcode(auprobe->insn); +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) + return -EINVAL; + if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) + return -EINVAL; + clear_pt_regs_flag(regs, PIF_PER_TRAP); + auprobe->saved_per = psw_bits(regs->psw).per; + auprobe->saved_int_code = regs->int_code; + regs->int_code = UPROBE_TRAP_NR; + regs->psw.addr = current->utask->xol_vaddr; + set_tsk_thread_flag(current, TIF_UPROBE_SINGLESTEP); + update_cr_regs(current); + return 0; +} + +bool arch_uprobe_xol_was_trapped(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs->int_code != UPROBE_TRAP_NR) + return true; + return false; +} + +static int check_per_event(unsigned short cause, unsigned long control, + struct pt_regs *regs) +{ + if (!(regs->psw.mask & PSW_MASK_PER)) + return 0; + /* user space single step */ + if (control == 0) + return 1; + /* over indication for storage alteration */ + if ((control & 0x20200000) && (cause & 0x2000)) + return 1; + if (cause & 0x8000) { + /* all branches */ + if ((control & 0x80800000) == 0x80000000) + return 1; + /* branch into selected range */ + if (((control & 0x80800000) == 0x80800000) && + regs->psw.addr >= current->thread.per_user.start && + regs->psw.addr <= current->thread.per_user.end) + return 1; + } + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int fixup = probe_get_fixup_type(auprobe->insn); + struct uprobe_task *utask = current->utask; + + clear_tsk_thread_flag(current, TIF_UPROBE_SINGLESTEP); + update_cr_regs(current); + psw_bits(regs->psw).per = auprobe->saved_per; + regs->int_code = auprobe->saved_int_code; + + if (fixup & FIXUP_PSW_NORMAL) + regs->psw.addr += utask->vaddr - utask->xol_vaddr; + if (fixup & FIXUP_RETURN_REGISTER) { + int reg = (auprobe->insn[0] & 0xf0) >> 4; + + regs->gprs[reg] += utask->vaddr - utask->xol_vaddr; + } + if (fixup & FIXUP_BRANCH_NOT_TAKEN) { + int ilen = insn_length(auprobe->insn[0] >> 8); + + if (regs->psw.addr - utask->xol_vaddr == ilen) + regs->psw.addr = utask->vaddr + ilen; + } + if (check_per_event(current->thread.per_event.cause, + current->thread.per_user.control, regs)) { + /* fix per address */ + current->thread.per_event.address = utask->vaddr; + /* trigger per event */ + set_pt_regs_flag(regs, PIF_PER_TRAP); + } + return 0; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!user_mode(regs)) + return NOTIFY_DONE; + if (regs->int_code & 0x200) /* Trap during transaction */ + return NOTIFY_DONE; + switch (val) { + case DIE_BPT: + if (uprobe_pre_sstep_notifier(regs)) + return NOTIFY_STOP; + break; + case DIE_SSTEP: + if (uprobe_post_sstep_notifier(regs)) + return NOTIFY_STOP; + default: + break; + } + return NOTIFY_DONE; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + clear_thread_flag(TIF_UPROBE_SINGLESTEP); + regs->int_code = auprobe->saved_int_code; + regs->psw.addr = current->utask->vaddr; + current->thread.per_event.address = current->utask->vaddr; +} + +unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline, + struct pt_regs *regs) +{ + unsigned long orig; + + orig = regs->gprs[14]; + regs->gprs[14] = trampoline; + return orig; +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return user_stack_pointer(regs) <= ret->stack; + else + return user_stack_pointer(regs) < ret->stack; +} + +/* Instruction Emulation */ + +static void adjust_psw_addr(psw_t *psw, unsigned long len) +{ + psw->addr = __rewind_psw(*psw, -len); +} + +#define EMU_ILLEGAL_OP 1 +#define EMU_SPECIFICATION 2 +#define EMU_ADDRESSING 3 + +#define emu_load_ril(ptr, output) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(*(ptr)) input; \ + int __rc = 0; \ + \ + if (!test_facility(34)) \ + __rc = EMU_ILLEGAL_OP; \ + else if ((u64 __force)ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (get_user(input, ptr)) \ + __rc = EMU_ADDRESSING; \ + else \ + *(output) = input; \ + __rc; \ +}) + +#define emu_store_ril(regs, ptr, input) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(ptr) __ptr = (ptr); \ + int __rc = 0; \ + \ + if (!test_facility(34)) \ + __rc = EMU_ILLEGAL_OP; \ + else if ((u64 __force)__ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (put_user(*(input), __ptr)) \ + __rc = EMU_ADDRESSING; \ + if (__rc == 0) \ + sim_stor_event(regs, \ + (void __force *)__ptr, \ + mask + 1); \ + __rc; \ +}) + +#define emu_cmp_ril(regs, ptr, cmp) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(*(ptr)) input; \ + int __rc = 0; \ + \ + if (!test_facility(34)) \ + __rc = EMU_ILLEGAL_OP; \ + else if ((u64 __force)ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (get_user(input, ptr)) \ + __rc = EMU_ADDRESSING; \ + else if (input > *(cmp)) \ + psw_bits((regs)->psw).cc = 1; \ + else if (input < *(cmp)) \ + psw_bits((regs)->psw).cc = 2; \ + else \ + psw_bits((regs)->psw).cc = 0; \ + __rc; \ +}) + +struct insn_ril { + u8 opc0; + u8 reg : 4; + u8 opc1 : 4; + s32 disp; +} __packed; + +union split_register { + u64 u64; + u32 u32[2]; + u16 u16[4]; + s64 s64; + s32 s32[2]; + s16 s16[4]; +}; + +/* + * If user per registers are setup to trace storage alterations and an + * emulated store took place on a fitting address a user trap is generated. + */ +static void sim_stor_event(struct pt_regs *regs, void *addr, int len) +{ + if (!(regs->psw.mask & PSW_MASK_PER)) + return; + if (!(current->thread.per_user.control & PER_EVENT_STORE)) + return; + if ((void *)current->thread.per_user.start > (addr + len)) + return; + if ((void *)current->thread.per_user.end < addr) + return; + current->thread.per_event.address = regs->psw.addr; + current->thread.per_event.cause = PER_EVENT_STORE >> 16; + set_pt_regs_flag(regs, PIF_PER_TRAP); +} + +/* + * pc relative instructions are emulated, since parameters may not be + * accessible from the xol area due to range limitations. + */ +static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + union split_register *rx; + struct insn_ril *insn; + unsigned int ilen; + void *uptr; + int rc = 0; + + insn = (struct insn_ril *) &auprobe->insn; + rx = (union split_register *) ®s->gprs[insn->reg]; + uptr = (void *)(regs->psw.addr + (insn->disp * 2)); + ilen = insn_length(insn->opc0); + + switch (insn->opc0) { + case 0xc0: + switch (insn->opc1) { + case 0x00: /* larl */ + rx->u64 = (unsigned long)uptr; + break; + } + break; + case 0xc4: + switch (insn->opc1) { + case 0x02: /* llhrl */ + rc = emu_load_ril((u16 __user *)uptr, &rx->u32[1]); + break; + case 0x04: /* lghrl */ + rc = emu_load_ril((s16 __user *)uptr, &rx->u64); + break; + case 0x05: /* lhrl */ + rc = emu_load_ril((s16 __user *)uptr, &rx->u32[1]); + break; + case 0x06: /* llghrl */ + rc = emu_load_ril((u16 __user *)uptr, &rx->u64); + break; + case 0x08: /* lgrl */ + rc = emu_load_ril((u64 __user *)uptr, &rx->u64); + break; + case 0x0c: /* lgfrl */ + rc = emu_load_ril((s32 __user *)uptr, &rx->u64); + break; + case 0x0d: /* lrl */ + rc = emu_load_ril((u32 __user *)uptr, &rx->u32[1]); + break; + case 0x0e: /* llgfrl */ + rc = emu_load_ril((u32 __user *)uptr, &rx->u64); + break; + case 0x07: /* sthrl */ + rc = emu_store_ril(regs, (u16 __user *)uptr, &rx->u16[3]); + break; + case 0x0b: /* stgrl */ + rc = emu_store_ril(regs, (u64 __user *)uptr, &rx->u64); + break; + case 0x0f: /* strl */ + rc = emu_store_ril(regs, (u32 __user *)uptr, &rx->u32[1]); + break; + } + break; + case 0xc6: + switch (insn->opc1) { + case 0x02: /* pfdrl */ + if (!test_facility(34)) + rc = EMU_ILLEGAL_OP; + break; + case 0x04: /* cghrl */ + rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); + break; + case 0x05: /* chrl */ + rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s32[1]); + break; + case 0x06: /* clghrl */ + rc = emu_cmp_ril(regs, (u16 __user *)uptr, &rx->u64); + break; + case 0x07: /* clhrl */ + rc = emu_cmp_ril(regs, (u16 __user *)uptr, &rx->u32[1]); + break; + case 0x08: /* cgrl */ + rc = emu_cmp_ril(regs, (s64 __user *)uptr, &rx->s64); + break; + case 0x0a: /* clgrl */ + rc = emu_cmp_ril(regs, (u64 __user *)uptr, &rx->u64); + break; + case 0x0c: /* cgfrl */ + rc = emu_cmp_ril(regs, (s32 __user *)uptr, &rx->s64); + break; + case 0x0d: /* crl */ + rc = emu_cmp_ril(regs, (s32 __user *)uptr, &rx->s32[1]); + break; + case 0x0e: /* clgfrl */ + rc = emu_cmp_ril(regs, (u32 __user *)uptr, &rx->u64); + break; + case 0x0f: /* clrl */ + rc = emu_cmp_ril(regs, (u32 __user *)uptr, &rx->u32[1]); + break; + } + break; + } + adjust_psw_addr(®s->psw, ilen); + switch (rc) { + case EMU_ILLEGAL_OP: + regs->int_code = ilen << 16 | 0x0001; + do_report_trap(regs, SIGILL, ILL_ILLOPC, NULL); + break; + case EMU_SPECIFICATION: + regs->int_code = ilen << 16 | 0x0006; + do_report_trap(regs, SIGILL, ILL_ILLOPC , NULL); + break; + case EMU_ADDRESSING: + regs->int_code = ilen << 16 | 0x0005; + do_report_trap(regs, SIGSEGV, SEGV_MAPERR, NULL); + break; + } +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || + ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && + !is_compat_task())) { + regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); + do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); + return true; + } + if (probe_is_insn_relative_long(auprobe->insn)) { + handle_insn_ril(auprobe, regs); + return true; + } + return false; +} diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c new file mode 100644 index 000000000..7ab7d256d --- /dev/null +++ b/arch/s390/kernel/vdso.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vdso setup for s390 + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/bootmem.h> +#include <linux/compat.h> +#include <asm/asm-offsets.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/sections.h> +#include <asm/vdso.h> +#include <asm/facility.h> + +#ifdef CONFIG_COMPAT +extern char vdso32_start, vdso32_end; +static void *vdso32_kbase = &vdso32_start; +static unsigned int vdso32_pages; +static struct page **vdso32_pagelist; +#endif + +extern char vdso64_start, vdso64_end; +static void *vdso64_kbase = &vdso64_start; +static unsigned int vdso64_pages; +static struct page **vdso64_pagelist; + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = 1; + +static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page **vdso_pagelist; + unsigned long vdso_pages; + + vdso_pagelist = vdso64_pagelist; + vdso_pages = vdso64_pages; +#ifdef CONFIG_COMPAT + if (vma->vm_mm->context.compat_mm) { + vdso_pagelist = vdso32_pagelist; + vdso_pages = vdso32_pages; + } +#endif + + if (vmf->pgoff >= vdso_pages) + return VM_FAULT_SIGBUS; + + vmf->page = vdso_pagelist[vmf->pgoff]; + get_page(vmf->page); + return 0; +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *vma) +{ + unsigned long vdso_pages; + + vdso_pages = vdso64_pages; +#ifdef CONFIG_COMPAT + if (vma->vm_mm->context.compat_mm) + vdso_pages = vdso32_pages; +#endif + + if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start) + return -EINVAL; + + if (WARN_ON_ONCE(current->mm != vma->vm_mm)) + return -EFAULT; + + current->mm->context.vdso_base = vma->vm_start; + return 0; +} + +static const struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, +}; + +static int __init vdso_setup(char *s) +{ + unsigned long val; + int rc; + + rc = 0; + if (strncmp(s, "on", 3) == 0) + vdso_enabled = 1; + else if (strncmp(s, "off", 4) == 0) + vdso_enabled = 0; + else { + rc = kstrtoul(s, 0, &val); + vdso_enabled = rc ? 0 : !!val; + } + return !rc; +} +__setup("vdso=", vdso_setup); + +/* + * The vdso data page + */ +static union { + struct vdso_data data; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = &vdso_data_store.data; + +/* + * Setup vdso data page. + */ +static void __init vdso_init_data(struct vdso_data *vd) +{ + vd->ectg_available = test_facility(31); +} + +/* + * Allocate/free per cpu vdso data. + */ +#define SEGMENT_ORDER 2 + +/* + * The initial vdso_data structure for the boot CPU. Eventually + * it is replaced with a properly allocated structure in vdso_init. + * This is necessary because a valid S390_lowcore.vdso_per_cpu_data + * pointer is required to be able to return from an interrupt or + * program check. See the exit paths in entry.S. + */ +struct vdso_data boot_vdso_data __initdata; + +void __init vdso_alloc_boot_cpu(struct lowcore *lowcore) +{ + lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data; +} + +int vdso_alloc_per_cpu(struct lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + struct vdso_per_cpu_data *vd; + + segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); + page_table = get_zeroed_page(GFP_KERNEL); + page_frame = get_zeroed_page(GFP_KERNEL); + if (!segment_table || !page_table || !page_frame) + goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); + + /* Initialize per-cpu vdso data page */ + vd = (struct vdso_per_cpu_data *) page_frame; + vd->cpu_nr = lowcore->cpu_nr; + vd->node_id = cpu_to_node(vd->cpu_nr); + + /* Set up page table for the vdso address space */ + memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); + memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); + + *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; + *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; + + lowcore->vdso_asce = segment_table + + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; + lowcore->vdso_per_cpu_data = page_frame; + + return 0; + +out: + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); + return -ENOMEM; +} + +void vdso_free_per_cpu(struct lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + + segment_table = lowcore->vdso_asce & PAGE_MASK; + page_table = *(unsigned long *) segment_table; + page_frame = *(unsigned long *) page_table; + + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); +} + +/* + * This is called from binfmt_elf, we create the special vma for the + * vDSO and insert it into the mm struct tree + */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long vdso_pages; + unsigned long vdso_base; + int rc; + + if (!vdso_enabled) + return 0; + /* + * Only map the vdso for dynamically linked elf binaries. + */ + if (!uses_interp) + return 0; + + vdso_pages = vdso64_pages; +#ifdef CONFIG_COMPAT + mm->context.compat_mm = is_compat_task(); + if (mm->context.compat_mm) + vdso_pages = vdso32_pages; +#endif + /* + * vDSO has a problem and was disabled, just don't "enable" it for + * the process + */ + if (vdso_pages == 0) + return 0; + + /* + * pick a base address for the vDSO in process space. We try to put + * it at vdso_base which is the "natural" base for it, but we might + * fail and end up putting it elsewhere. + */ + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); + if (IS_ERR_VALUE(vdso_base)) { + rc = vdso_base; + goto out_up; + } + + /* + * our vma flags don't have VM_WRITE so by default, the process + * isn't allowed to write those pages. + * gdb can break that with ptrace interface, and thus trigger COW + * on those pages but it's then your responsibility to never do that + * on the "data" page of the vDSO or you'll stop getting kernel + * updates and your nice userland gettimeofday will be totally dead. + * It's fine to use that for setting breakpoints in the vDSO code + * pages though. + */ + vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &vdso_mapping); + if (IS_ERR(vma)) { + rc = PTR_ERR(vma); + goto out_up; + } + + current->mm->context.vdso_base = vdso_base; + rc = 0; + +out_up: + up_write(&mm->mmap_sem); + return rc; +} + +static int __init vdso_init(void) +{ + int i; + + vdso_init_data(vdso_data); +#ifdef CONFIG_COMPAT + /* Calculate the size of the 32 bit vDSO */ + vdso32_pages = ((&vdso32_end - &vdso32_start + + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; + + /* Make sure pages are in the correct state */ + vdso32_pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), + GFP_KERNEL); + BUG_ON(vdso32_pagelist == NULL); + for (i = 0; i < vdso32_pages - 1; i++) { + struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + vdso32_pagelist[i] = pg; + } + vdso32_pagelist[vdso32_pages - 1] = virt_to_page(vdso_data); + vdso32_pagelist[vdso32_pages] = NULL; +#endif + + /* Calculate the size of the 64 bit vDSO */ + vdso64_pages = ((&vdso64_end - &vdso64_start + + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; + + /* Make sure pages are in the correct state */ + vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), + GFP_KERNEL); + BUG_ON(vdso64_pagelist == NULL); + for (i = 0; i < vdso64_pages - 1; i++) { + struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + vdso64_pagelist[i] = pg; + } + vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); + vdso64_pagelist[vdso64_pages] = NULL; + if (vdso_alloc_per_cpu(&S390_lowcore)) + BUG(); + + get_page(virt_to_page(vdso_data)); + + return 0; +} +early_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000..e45fba9d0 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000..e76309fbb --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso, has to be asm only for now + +KCOV_INSTRUMENT := n + +obj-vdso32 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO + +KBUILD_AFLAGS_31 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_31 += -m31 -s + +KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin +KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=both) + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31) + +obj-y += vdso32_wrapper.o +extra-y += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,vdso32ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso32): %.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32ld = VDSO32L $@ + cmd_vdso32ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@ +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so diff --git a/arch/s390/kernel/vdso32/clock_getres.S b/arch/s390/kernel/vdso32/clock_getres.S new file mode 100644 index 000000000..eaf9cf141 --- /dev/null +++ b/arch/s390/kernel/vdso32/clock_getres.S @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of clock_getres() for 32 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> + + .text + .align 4 + .globl __kernel_clock_getres + .type __kernel_clock_getres,@function +__kernel_clock_getres: + CFI_STARTPROC + basr %r1,0 + la %r1,4f-.(%r1) + chi %r2,__CLOCK_REALTIME + je 0f + chi %r2,__CLOCK_MONOTONIC + je 0f + la %r1,5f-4f(%r1) + chi %r2,__CLOCK_REALTIME_COARSE + je 0f + chi %r2,__CLOCK_MONOTONIC_COARSE + jne 3f +0: ltr %r3,%r3 + jz 2f /* res == NULL */ +1: l %r0,0(%r1) + xc 0(4,%r3),0(%r3) /* set tp->tv_sec to zero */ + st %r0,4(%r3) /* store tp->tv_usec */ +2: lhi %r2,0 + br %r14 +3: lhi %r1,__NR_clock_getres /* fallback to svc */ + svc 0 + br %r14 + CFI_ENDPROC +4: .long __CLOCK_REALTIME_RES +5: .long __CLOCK_COARSE_RES + .size __kernel_clock_getres,.-__kernel_clock_getres diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S new file mode 100644 index 000000000..ada5c11a1 --- /dev/null +++ b/arch/s390/kernel/vdso32/clock_gettime.S @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of clock_gettime() for 32 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + + .text + .align 4 + .globl __kernel_clock_gettime + .type __kernel_clock_gettime,@function +__kernel_clock_gettime: + CFI_STARTPROC + ahi %r15,-16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + basr %r5,0 +0: al %r5,21f-0b(%r5) /* get &_vdso_data */ + chi %r2,__CLOCK_REALTIME_COARSE + je 10f + chi %r2,__CLOCK_REALTIME + je 11f + chi %r2,__CLOCK_MONOTONIC_COARSE + je 9f + chi %r2,__CLOCK_MONOTONIC + jne 19f + + /* CLOCK_MONOTONIC */ +1: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */ + tml %r4,0x0001 /* pending update ? loop */ + jnz 1b + stcke 0(%r15) /* Store TOD clock */ + lm %r0,%r1,1(%r15) + s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + sl %r1,__VDSO_XTIME_STAMP+4(%r5) + brc 3,2f + ahi %r0,-1 +2: ms %r0,__VDSO_TK_MULT(%r5) /* * tk->mult */ + lr %r2,%r0 + l %r0,__VDSO_TK_MULT(%r5) + ltr %r1,%r1 + mr %r0,%r0 + jnm 3f + a %r0,__VDSO_TK_MULT(%r5) +3: alr %r0,%r2 + al %r0,__VDSO_WTOM_NSEC(%r5) + al %r1,__VDSO_WTOM_NSEC+4(%r5) + brc 12,5f + ahi %r0,1 +5: l %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + srdl %r0,0(%r2) /* >> tk->shift */ + l %r2,__VDSO_WTOM_SEC+4(%r5) + cl %r4,__VDSO_UPD_COUNT+4(%r5) /* check update counter */ + jne 1b + basr %r5,0 +6: ltr %r0,%r0 + jnz 7f + cl %r1,20f-6b(%r5) + jl 8f +7: ahi %r2,1 + sl %r1,20f-6b(%r5) + brc 3,6b + ahi %r0,-1 + j 6b +8: st %r2,0(%r3) /* store tp->tv_sec */ + st %r1,4(%r3) /* store tp->tv_nsec */ + lhi %r2,0 + ahi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + + /* CLOCK_MONOTONIC_COARSE */ + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD +9: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */ + tml %r4,0x0001 /* pending update ? loop */ + jnz 9b + l %r2,__VDSO_WTOM_CRS_SEC+4(%r5) + l %r1,__VDSO_WTOM_CRS_NSEC+4(%r5) + cl %r4,__VDSO_UPD_COUNT+4(%r5) /* check update counter */ + jne 9b + j 8b + + /* CLOCK_REALTIME_COARSE */ +10: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */ + tml %r4,0x0001 /* pending update ? loop */ + jnz 10b + l %r2,__VDSO_XTIME_CRS_SEC+4(%r5) + l %r1,__VDSO_XTIME_CRS_NSEC+4(%r5) + cl %r4,__VDSO_UPD_COUNT+4(%r5) /* check update counter */ + jne 10b + j 17f + + /* CLOCK_REALTIME */ +11: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */ + tml %r4,0x0001 /* pending update ? loop */ + jnz 11b + stcke 0(%r15) /* Store TOD clock */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) /* no - ts_steering_end */ + sl %r1,5(%r15) + brc 3,22f + ahi %r0,-1 +22: ltr %r0,%r0 /* past end of steering? */ + jm 24f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 23f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 23f + ahi %r0,-1 +23: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,25f + ahi %r0,1 + j 25f +24: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +25: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + sl %r1,__VDSO_XTIME_STAMP+4(%r5) + brc 3,12f + ahi %r0,-1 +12: ms %r0,__VDSO_TK_MULT(%r5) /* * tk->mult */ + lr %r2,%r0 + l %r0,__VDSO_TK_MULT(%r5) + ltr %r1,%r1 + mr %r0,%r0 + jnm 13f + a %r0,__VDSO_TK_MULT(%r5) +13: alr %r0,%r2 + al %r0,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ + al %r1,__VDSO_XTIME_NSEC+4(%r5) + brc 12,14f + ahi %r0,1 +14: l %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + srdl %r0,0(%r2) /* >> tk->shift */ + l %r2,__VDSO_XTIME_SEC+4(%r5) + cl %r4,__VDSO_UPD_COUNT+4(%r5) /* check update counter */ + jne 11b + basr %r5,0 +15: ltr %r0,%r0 + jnz 16f + cl %r1,20f-15b(%r5) + jl 17f +16: ahi %r2,1 + sl %r1,20f-15b(%r5) + brc 3,15b + ahi %r0,-1 + j 15b +17: st %r2,0(%r3) /* store tp->tv_sec */ + st %r1,4(%r3) /* store tp->tv_nsec */ + lhi %r2,0 + ahi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + + /* Fallback to system call */ + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD +19: lhi %r1,__NR_clock_gettime + svc 0 + ahi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + +20: .long 1000000000 +21: .long _vdso_data - 0b + .size __kernel_clock_gettime,.-__kernel_clock_gettime diff --git a/arch/s390/kernel/vdso32/getcpu.S b/arch/s390/kernel/vdso32/getcpu.S new file mode 100644 index 000000000..25515f3fb --- /dev/null +++ b/arch/s390/kernel/vdso32/getcpu.S @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of getcpu() for 32 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf.h> + + .text + .align 4 + .globl __kernel_getcpu + .type __kernel_getcpu,@function +__kernel_getcpu: + CFI_STARTPROC + la %r4,0 + sacf 256 + l %r5,__VDSO_CPU_NR(%r4) + l %r4,__VDSO_NODE_ID(%r4) + sacf 0 + ltr %r2,%r2 + jz 2f + st %r5,0(%r2) +2: ltr %r3,%r3 + jz 3f + st %r4,0(%r3) +3: lhi %r2,0 + br %r14 + CFI_ENDPROC + .size __kernel_getcpu,.-__kernel_getcpu diff --git a/arch/s390/kernel/vdso32/gettimeofday.S b/arch/s390/kernel/vdso32/gettimeofday.S new file mode 100644 index 000000000..b23063fbc --- /dev/null +++ b/arch/s390/kernel/vdso32/gettimeofday.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of gettimeofday() for 32 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + + .text + .align 4 + .globl __kernel_gettimeofday + .type __kernel_gettimeofday,@function +__kernel_gettimeofday: + CFI_STARTPROC + ahi %r15,-16 + CFI_ADJUST_CFA_OFFSET 16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + basr %r5,0 +0: al %r5,13f-0b(%r5) /* get &_vdso_data */ +1: ltr %r3,%r3 /* check if tz is NULL */ + je 2f + mvc 0(8,%r3),__VDSO_TIMEZONE(%r5) +2: ltr %r2,%r2 /* check if tv is NULL */ + je 10f + l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */ + tml %r4,0x0001 /* pending update ? loop */ + jnz 1b + stcke 0(%r15) /* Store TOD clock */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) + sl %r1,5(%r15) + brc 3,14f + ahi %r0,-1 +14: ltr %r0,%r0 /* past end of steering? */ + jm 16f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 15f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 15f + ahi %r0,-1 +15: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,17f + ahi %r0,1 + j 17f +16: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +17: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + sl %r1,__VDSO_XTIME_STAMP+4(%r5) + brc 3,3f + ahi %r0,-1 +3: ms %r0,__VDSO_TK_MULT(%r5) /* * tk->mult */ + st %r0,0(%r15) + l %r0,__VDSO_TK_MULT(%r5) + ltr %r1,%r1 + mr %r0,%r0 + jnm 4f + a %r0,__VDSO_TK_MULT(%r5) +4: al %r0,0(%r15) + al %r0,__VDSO_XTIME_NSEC(%r5) /* + xtime */ + al %r1,__VDSO_XTIME_NSEC+4(%r5) + brc 12,5f + ahi %r0,1 +5: mvc 0(4,%r15),__VDSO_XTIME_SEC+4(%r5) + cl %r4,__VDSO_UPD_COUNT+4(%r5) /* check update counter */ + jne 1b + l %r4,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + srdl %r0,0(%r4) /* >> tk->shift */ + l %r4,0(%r15) /* get tv_sec from stack */ + basr %r5,0 +6: ltr %r0,%r0 + jnz 7f + cl %r1,11f-6b(%r5) + jl 8f +7: ahi %r4,1 + sl %r1,11f-6b(%r5) + brc 3,6b + ahi %r0,-1 + j 6b +8: st %r4,0(%r2) /* store tv->tv_sec */ + ltr %r1,%r1 + m %r0,12f-6b(%r5) + jnm 9f + al %r0,12f-6b(%r5) +9: srl %r0,6 + st %r0,4(%r2) /* store tv->tv_usec */ +10: slr %r2,%r2 + ahi %r15,16 + CFI_ADJUST_CFA_OFFSET -16 + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC +11: .long 1000000000 +12: .long 274877907 +13: .long _vdso_data - 0b + .size __kernel_gettimeofday,.-__kernel_gettimeofday diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 000000000..db19d0680 --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 000000000..721c4954c --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 32 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + . = VDSO32_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + . = ALIGN(PAGE_SIZE); + PROVIDE(_vdso_data = .); + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_getcpu; + + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000..de2fb9304 --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore new file mode 100644 index 000000000..3fd18cf9f --- /dev/null +++ b/arch/s390/kernel/vdso64/.gitignore @@ -0,0 +1 @@ +vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile new file mode 100644 index 000000000..f849ac61c --- /dev/null +++ b/arch/s390/kernel/vdso64/Makefile @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso, has to be asm only for now + +KCOV_INSTRUMENT := n + +obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o + +# Build rules + +targets := $(obj-vdso64) vdso64.so vdso64.so.dbg +obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO + +KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_64 += -m64 -s + +KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin +KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=both) + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) + +obj-y += vdso64_wrapper.o +extra-y += vdso64.lds +CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE + $(call if_changed,vdso64ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso64): %.o: %.S FORCE + $(call if_changed_dep,vdso64as) + +# actual build commands +quiet_cmd_vdso64ld = VDSO64L $@ + cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@ +quiet_cmd_vdso64as = VDSO64A $@ + cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso64.so: $(obj)/vdso64.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso64.so diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S new file mode 100644 index 000000000..0c79caa32 --- /dev/null +++ b/arch/s390/kernel/vdso64/clock_getres.S @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of clock_getres() for 64 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> + + .text + .align 4 + .globl __kernel_clock_getres + .type __kernel_clock_getres,@function +__kernel_clock_getres: + CFI_STARTPROC + larl %r1,3f + lg %r0,0(%r1) + cghi %r2,__CLOCK_REALTIME_COARSE + je 0f + cghi %r2,__CLOCK_MONOTONIC_COARSE + je 0f + larl %r1,_vdso_data + llgf %r0,__VDSO_CLOCK_REALTIME_RES(%r1) + cghi %r2,__CLOCK_REALTIME + je 0f + cghi %r2,__CLOCK_MONOTONIC + je 0f + cghi %r2,__CLOCK_THREAD_CPUTIME_ID + je 0f + cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ + jne 2f + larl %r5,_vdso_data + icm %r0,15,__LC_ECTG_OK(%r5) + jz 2f +0: ltgr %r3,%r3 + jz 1f /* res == NULL */ + xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */ + stg %r0,8(%r3) /* store tp->tv_usec */ +1: lghi %r2,0 + br %r14 +2: lghi %r1,__NR_clock_getres /* fallback to svc */ + svc 0 + br %r14 + CFI_ENDPROC +3: .quad __CLOCK_COARSE_RES + .size __kernel_clock_getres,.-__kernel_clock_getres diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S new file mode 100644 index 000000000..9d2ee79b9 --- /dev/null +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of clock_gettime() for 64 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + + .text + .align 4 + .globl __kernel_clock_gettime + .type __kernel_clock_gettime,@function +__kernel_clock_gettime: + CFI_STARTPROC + aghi %r15,-16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + larl %r5,_vdso_data + cghi %r2,__CLOCK_REALTIME_COARSE + je 4f + cghi %r2,__CLOCK_REALTIME + je 5f + cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */ + je 9f + cghi %r2,__CLOCK_MONOTONIC_COARSE + je 3f + cghi %r2,__CLOCK_MONOTONIC + jne 12f + + /* CLOCK_MONOTONIC */ +0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ + tmll %r4,0x0001 /* pending update ? loop */ + jnz 0b + stcke 0(%r15) /* Store TOD clock */ + lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + lg %r0,__VDSO_WTOM_SEC(%r5) + lg %r1,1(%r15) + sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ + alg %r1,__VDSO_WTOM_NSEC(%r5) + srlg %r1,%r1,0(%r2) /* >> tk->shift */ + clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ + jne 0b + larl %r5,13f +1: clg %r1,0(%r5) + jl 2f + slg %r1,0(%r5) + aghi %r0,1 + j 1b +2: stg %r0,0(%r3) /* store tp->tv_sec */ + stg %r1,8(%r3) /* store tp->tv_nsec */ + lghi %r2,0 + aghi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + + /* CLOCK_MONOTONIC_COARSE */ + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD +3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ + tmll %r4,0x0001 /* pending update ? loop */ + jnz 3b + lg %r0,__VDSO_WTOM_CRS_SEC(%r5) + lg %r1,__VDSO_WTOM_CRS_NSEC(%r5) + clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ + jne 3b + j 2b + + /* CLOCK_REALTIME_COARSE */ +4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ + tmll %r4,0x0001 /* pending update ? loop */ + jnz 4b + lg %r0,__VDSO_XTIME_CRS_SEC(%r5) + lg %r1,__VDSO_XTIME_CRS_NSEC(%r5) + clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ + jne 4b + j 7f + + /* CLOCK_REALTIME */ +5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ + tmll %r4,0x0001 /* pending update ? loop */ + jnz 5b + stcke 0(%r15) /* Store TOD clock */ + lg %r1,1(%r15) + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 17f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 18f + lcgr %r0,%r0 /* negative TOD offset */ +18: algr %r1,%r0 /* add steering offset */ +17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ + alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ + srlg %r1,%r1,0(%r2) /* >> tk->shift */ + lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ + clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ + jne 5b + larl %r5,13f +6: clg %r1,0(%r5) + jl 7f + slg %r1,0(%r5) + aghi %r0,1 + j 6b +7: stg %r0,0(%r3) /* store tp->tv_sec */ + stg %r1,8(%r3) /* store tp->tv_nsec */ + lghi %r2,0 + aghi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + + /* CPUCLOCK_VIRT for this thread */ + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD +9: lghi %r4,0 + icm %r0,15,__VDSO_ECTG_OK(%r5) + jz 12f + sacf 256 /* Magic ectg instruction */ + .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 + sacf 0 + algr %r1,%r0 /* r1 = cputime as TOD value */ + mghi %r1,1000 /* convert to nanoseconds */ + srlg %r1,%r1,12 /* r1 = cputime in nanosec */ + lgr %r4,%r1 + larl %r5,13f + srlg %r1,%r1,9 /* divide by 1000000000 */ + mlg %r0,8(%r5) + srlg %r0,%r0,11 /* r0 = tv_sec */ + stg %r0,0(%r3) + msg %r0,0(%r5) /* calculate tv_nsec */ + slgr %r4,%r0 /* r4 = tv_nsec */ + stg %r4,8(%r3) + lghi %r2,0 + aghi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + + /* Fallback to system call */ + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD +12: lghi %r1,__NR_clock_gettime + svc 0 + aghi %r15,16 + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + +13: .quad 1000000000 +14: .quad 19342813113834067 + .size __kernel_clock_gettime,.-__kernel_clock_gettime diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S new file mode 100644 index 000000000..2446e9dac --- /dev/null +++ b/arch/s390/kernel/vdso64/getcpu.S @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of getcpu() for 64 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf.h> + + .text + .align 4 + .globl __kernel_getcpu + .type __kernel_getcpu,@function +__kernel_getcpu: + CFI_STARTPROC + la %r4,0 + sacf 256 + l %r5,__VDSO_CPU_NR(%r4) + l %r4,__VDSO_NODE_ID(%r4) + sacf 0 + ltgr %r2,%r2 + jz 2f + st %r5,0(%r2) +2: ltgr %r3,%r3 + jz 3f + st %r4,0(%r3) +3: lghi %r2,0 + br %r14 + CFI_ENDPROC + .size __kernel_getcpu,.-__kernel_getcpu diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S new file mode 100644 index 000000000..aebe10dc7 --- /dev/null +++ b/arch/s390/kernel/vdso64/gettimeofday.S @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Userland implementation of gettimeofday() for 64 bits processes in a + * s390 kernel for use in the vDSO + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/dwarf.h> +#include <asm/ptrace.h> + + .text + .align 4 + .globl __kernel_gettimeofday + .type __kernel_gettimeofday,@function +__kernel_gettimeofday: + CFI_STARTPROC + aghi %r15,-16 + CFI_ADJUST_CFA_OFFSET 16 + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + larl %r5,_vdso_data +0: ltgr %r3,%r3 /* check if tz is NULL */ + je 1f + mvc 0(8,%r3),__VDSO_TIMEZONE(%r5) +1: ltgr %r2,%r2 /* check if tv is NULL */ + je 4f + lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */ + tmll %r4,0x0001 /* pending update ? loop */ + jnz 0b + stcke 0(%r15) /* Store TOD clock */ + lg %r1,1(%r15) + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 6f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 7f + lcgr %r0,%r0 /* negative TOD offset */ +7: algr %r1,%r0 /* add steering offset */ +6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ + alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ + lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ + clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ + jne 0b + lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ + srlg %r1,%r1,0(%r5) /* >> tk->shift */ + larl %r5,5f +2: clg %r1,0(%r5) + jl 3f + slg %r1,0(%r5) + aghi %r0,1 + j 2b +3: stg %r0,0(%r2) /* store tv->tv_sec */ + slgr %r0,%r0 /* tv_nsec -> tv_usec */ + ml %r0,8(%r5) + srlg %r0,%r0,6 + stg %r0,8(%r2) /* store tv->tv_usec */ +4: lghi %r2,0 + aghi %r15,16 + CFI_ADJUST_CFA_OFFSET -16 + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC +5: .quad 1000000000 + .long 274877907 + .size __kernel_gettimeofday,.-__kernel_gettimeofday diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S new file mode 100644 index 000000000..db19d0680 --- /dev/null +++ b/arch/s390/kernel/vdso64/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S new file mode 100644 index 000000000..7ddb116b5 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) +ENTRY(_start) + +SECTIONS +{ + . = VDSO64_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + . = ALIGN(PAGE_SIZE); + PROVIDE(_vdso_data = .); + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_getcpu; + + local: *; + }; +} diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S new file mode 100644 index 000000000..672184998 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso64_start, vdso64_end + .balign PAGE_SIZE +vdso64_start: + .incbin "arch/s390/kernel/vdso64/vdso64.so" + .balign PAGE_SIZE +vdso64_end: + + .previous diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S new file mode 100644 index 000000000..18ede6e80 --- /dev/null +++ b/arch/s390/kernel/vmlinux.lds.S @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* ld script to make s390 Linux kernel + * Written by Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include <asm/thread_info.h> +#include <asm/page.h> + +/* + * Put .bss..swapper_pg_dir as the first thing in .bss. This will + * make sure it has 16k alignment. + */ +#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) + +/* Handle ro_after_init data on our own. */ +#define RO_AFTER_INIT_DATA + +#include <asm-generic/vmlinux.lds.h> + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) +ENTRY(startup_continue) +jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ + . = 0x100000; + .text : { + _stext = .; /* Start of text section */ + _text = .; /* Text and read-only data */ + HEAD_TEXT + TEXT_TEXT + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + *(.text.*_indirect_*) + *(.fixup) + *(.gnu.warning) + . = ALIGN(PAGE_SIZE); + _etext = .; /* End of text section */ + } :text = 0x0700 + + NOTES :text :note + + .dummy : { *(.dummy) } :data + + RO_DATA_SECTION(PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + _sdata = .; /* Start of data section */ + + . = ALIGN(PAGE_SIZE); + __start_ro_after_init = .; + .data..ro_after_init : { + *(.data..ro_after_init) + } + EXCEPTION_TABLE(16) + . = ALIGN(PAGE_SIZE); + __end_ro_after_init = .; + + RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) + + _edata = .; /* End of data section */ + + /* will be freed after init */ + . = ALIGN(PAGE_SIZE); /* Init code and data */ + __init_begin = .; + + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + . = ALIGN(PAGE_SIZE); + _einittext = .; + } + + /* + * .exit.text is discarded at runtime, not link time, + * to deal with references from __bug_table + */ + .exit.text : { + EXIT_TEXT + } + + .exit.data : { + EXIT_DATA + } + + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + * Note, that it is a part of __init region. + */ + . = ALIGN(8); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + * Note, that it is a part of __init region. + */ + .altinstr_replacement : { + *(.altinstr_replacement) + } + + /* + * Table with the patch locations to undo expolines + */ + .nospec_call_table : { + __nospec_call_start = . ; + *(.s390_indirect*) + __nospec_call_end = . ; + } + .nospec_return_table : { + __nospec_return_start = . ; + *(.s390_return*) + __nospec_return_end = . ; + } + + /* early.c uses stsi, which requires page aligned data. */ + . = ALIGN(PAGE_SIZE); + INIT_DATA_SECTION(0x100) + + PERCPU_SECTION(0x100) + . = ALIGN(PAGE_SIZE); + __init_end = .; /* freed after init ends here */ + + BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + + _end = . ; + + /* Debugging sections. */ + STABS_DEBUG + DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + *(.eh_frame) + } +} diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c new file mode 100644 index 000000000..f24395a01 --- /dev/null +++ b/arch/s390/kernel/vtime.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtual cpu timer based timer functions. + * + * Copyright IBM Corp. 2004, 2012 + * Author(s): Jan Glauber <jan.glauber@de.ibm.com> + */ + +#include <linux/kernel_stat.h> +#include <linux/sched/cputime.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/timex.h> +#include <linux/types.h> +#include <linux/time.h> + +#include <asm/vtimer.h> +#include <asm/vtime.h> +#include <asm/cpu_mf.h> +#include <asm/smp.h> + +#include "entry.h" + +static void virt_timer_expire(void); + +static LIST_HEAD(virt_timer_list); +static DEFINE_SPINLOCK(virt_timer_lock); +static atomic64_t virt_timer_current; +static atomic64_t virt_timer_elapsed; + +DEFINE_PER_CPU(u64, mt_cycles[8]); +static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; +static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; +static DEFINE_PER_CPU(u64, mt_scaling_jiffies); + +static inline u64 get_vtimer(void) +{ + u64 timer; + + asm volatile("stpt %0" : "=m" (timer)); + return timer; +} + +static inline void set_vtimer(u64 expires) +{ + u64 timer; + + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ + " spt %1" /* Set new value imm. afterwards */ + : "=m" (timer) : "m" (expires)); + S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; + S390_lowcore.last_update_timer = expires; +} + +static inline int virt_timer_forward(u64 elapsed) +{ + BUG_ON(!irqs_disabled()); + + if (list_empty(&virt_timer_list)) + return 0; + elapsed = atomic64_add_return(elapsed, &virt_timer_elapsed); + return elapsed >= atomic64_read(&virt_timer_current); +} + +static void update_mt_scaling(void) +{ + u64 cycles_new[8], *cycles_old; + u64 delta, fac, mult, div; + int i; + + stcctm5(smp_cpu_mtid + 1, cycles_new); + cycles_old = this_cpu_ptr(mt_cycles); + fac = 1; + mult = div = 0; + for (i = 0; i <= smp_cpu_mtid; i++) { + delta = cycles_new[i] - cycles_old[i]; + div += delta; + mult *= i + 1; + mult += delta * fac; + fac *= i + 1; + } + div *= fac; + if (div > 0) { + /* Update scaling factor */ + __this_cpu_write(mt_scaling_mult, mult); + __this_cpu_write(mt_scaling_div, div); + memcpy(cycles_old, cycles_new, + sizeof(u64) * (smp_cpu_mtid + 1)); + } + __this_cpu_write(mt_scaling_jiffies, jiffies_64); +} + +static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new) +{ + u64 delta; + + delta = new - *tsk_vtime; + *tsk_vtime = new; + return delta; +} + + +static inline u64 scale_vtime(u64 vtime) +{ + u64 mult = __this_cpu_read(mt_scaling_mult); + u64 div = __this_cpu_read(mt_scaling_div); + + if (smp_cpu_mtid) + return vtime * mult / div; + return vtime; +} + +static void account_system_index_scaled(struct task_struct *p, u64 cputime, + enum cpu_usage_stat index) +{ + p->stimescaled += cputime_to_nsecs(scale_vtime(cputime)); + account_system_index_time(p, cputime_to_nsecs(cputime), index); +} + +/* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ +static int do_account_vtime(struct task_struct *tsk) +{ + u64 timer, clock, user, guest, system, hardirq, softirq, steal; + + timer = S390_lowcore.last_update_timer; + clock = S390_lowcore.last_update_clock; + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ +#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES + " stckf %1" /* Store current tod clock value */ +#else + " stck %1" /* Store current tod clock value */ +#endif + : "=m" (S390_lowcore.last_update_timer), + "=m" (S390_lowcore.last_update_clock)); + clock = S390_lowcore.last_update_clock - clock; + timer -= S390_lowcore.last_update_timer; + + if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else + S390_lowcore.system_timer += timer; + + /* Update MT utilization calculation */ + if (smp_cpu_mtid && + time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) + update_mt_scaling(); + + /* Calculate cputime delta */ + user = update_tsk_timer(&tsk->thread.user_timer, + READ_ONCE(S390_lowcore.user_timer)); + guest = update_tsk_timer(&tsk->thread.guest_timer, + READ_ONCE(S390_lowcore.guest_timer)); + system = update_tsk_timer(&tsk->thread.system_timer, + READ_ONCE(S390_lowcore.system_timer)); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, + READ_ONCE(S390_lowcore.hardirq_timer)); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, + READ_ONCE(S390_lowcore.softirq_timer)); + S390_lowcore.steal_timer += + clock - user - guest - system - hardirq - softirq; + + /* Push account value */ + if (user) { + account_user_time(tsk, cputime_to_nsecs(user)); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(user)); + } + + if (guest) { + account_guest_time(tsk, cputime_to_nsecs(guest)); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest)); + } + + if (system) + account_system_index_scaled(tsk, system, CPUTIME_SYSTEM); + if (hardirq) + account_system_index_scaled(tsk, hardirq, CPUTIME_IRQ); + if (softirq) + account_system_index_scaled(tsk, softirq, CPUTIME_SOFTIRQ); + + steal = S390_lowcore.steal_timer; + if ((s64) steal > 0) { + S390_lowcore.steal_timer = 0; + account_steal_time(cputime_to_nsecs(steal)); + } + + return virt_timer_forward(user + guest + system + hardirq + softirq); +} + +void vtime_task_switch(struct task_struct *prev) +{ + do_account_vtime(prev); + prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.guest_timer = S390_lowcore.guest_timer; + prev->thread.system_timer = S390_lowcore.system_timer; + prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; + prev->thread.softirq_timer = S390_lowcore.softirq_timer; + S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.guest_timer = current->thread.guest_timer; + S390_lowcore.system_timer = current->thread.system_timer; + S390_lowcore.hardirq_timer = current->thread.hardirq_timer; + S390_lowcore.softirq_timer = current->thread.softirq_timer; +} + +/* + * In s390, accounting pending user time also implies + * accounting system time in order to correctly compute + * the stolen time accounting. + */ +void vtime_flush(struct task_struct *tsk) +{ + if (do_account_vtime(tsk)) + virt_timer_expire(); +} + +/* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ +void vtime_account_irq_enter(struct task_struct *tsk) +{ + u64 timer; + + timer = S390_lowcore.last_update_timer; + S390_lowcore.last_update_timer = get_vtimer(); + timer -= S390_lowcore.last_update_timer; + + if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) + S390_lowcore.guest_timer += timer; + else if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else if (in_serving_softirq()) + S390_lowcore.softirq_timer += timer; + else + S390_lowcore.system_timer += timer; + + virt_timer_forward(timer); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); + +void vtime_account_system(struct task_struct *tsk) +__attribute__((alias("vtime_account_irq_enter"))); +EXPORT_SYMBOL_GPL(vtime_account_system); + +/* + * Sorted add to a list. List is linear searched until first bigger + * element is found. + */ +static void list_add_sorted(struct vtimer_list *timer, struct list_head *head) +{ + struct vtimer_list *tmp; + + list_for_each_entry(tmp, head, entry) { + if (tmp->expires > timer->expires) { + list_add_tail(&timer->entry, &tmp->entry); + return; + } + } + list_add_tail(&timer->entry, head); +} + +/* + * Handler for expired virtual CPU timer. + */ +static void virt_timer_expire(void) +{ + struct vtimer_list *timer, *tmp; + unsigned long elapsed; + LIST_HEAD(cb_list); + + /* walk timer list, fire all expired timers */ + spin_lock(&virt_timer_lock); + elapsed = atomic64_read(&virt_timer_elapsed); + list_for_each_entry_safe(timer, tmp, &virt_timer_list, entry) { + if (timer->expires < elapsed) + /* move expired timer to the callback queue */ + list_move_tail(&timer->entry, &cb_list); + else + timer->expires -= elapsed; + } + if (!list_empty(&virt_timer_list)) { + timer = list_first_entry(&virt_timer_list, + struct vtimer_list, entry); + atomic64_set(&virt_timer_current, timer->expires); + } + atomic64_sub(elapsed, &virt_timer_elapsed); + spin_unlock(&virt_timer_lock); + + /* Do callbacks and recharge periodic timers */ + list_for_each_entry_safe(timer, tmp, &cb_list, entry) { + list_del_init(&timer->entry); + timer->function(timer->data); + if (timer->interval) { + /* Recharge interval timer */ + timer->expires = timer->interval + + atomic64_read(&virt_timer_elapsed); + spin_lock(&virt_timer_lock); + list_add_sorted(timer, &virt_timer_list); + spin_unlock(&virt_timer_lock); + } + } +} + +void init_virt_timer(struct vtimer_list *timer) +{ + timer->function = NULL; + INIT_LIST_HEAD(&timer->entry); +} +EXPORT_SYMBOL(init_virt_timer); + +static inline int vtimer_pending(struct vtimer_list *timer) +{ + return !list_empty(&timer->entry); +} + +static void internal_add_vtimer(struct vtimer_list *timer) +{ + if (list_empty(&virt_timer_list)) { + /* First timer, just program it. */ + atomic64_set(&virt_timer_current, timer->expires); + atomic64_set(&virt_timer_elapsed, 0); + list_add(&timer->entry, &virt_timer_list); + } else { + /* Update timer against current base. */ + timer->expires += atomic64_read(&virt_timer_elapsed); + if (likely((s64) timer->expires < + (s64) atomic64_read(&virt_timer_current))) + /* The new timer expires before the current timer. */ + atomic64_set(&virt_timer_current, timer->expires); + /* Insert new timer into the list. */ + list_add_sorted(timer, &virt_timer_list); + } +} + +static void __add_vtimer(struct vtimer_list *timer, int periodic) +{ + unsigned long flags; + + timer->interval = periodic ? timer->expires : 0; + spin_lock_irqsave(&virt_timer_lock, flags); + internal_add_vtimer(timer); + spin_unlock_irqrestore(&virt_timer_lock, flags); +} + +/* + * add_virt_timer - add a oneshot virtual CPU timer + */ +void add_virt_timer(struct vtimer_list *timer) +{ + __add_vtimer(timer, 0); +} +EXPORT_SYMBOL(add_virt_timer); + +/* + * add_virt_timer_int - add an interval virtual CPU timer + */ +void add_virt_timer_periodic(struct vtimer_list *timer) +{ + __add_vtimer(timer, 1); +} +EXPORT_SYMBOL(add_virt_timer_periodic); + +static int __mod_vtimer(struct vtimer_list *timer, u64 expires, int periodic) +{ + unsigned long flags; + int rc; + + BUG_ON(!timer->function); + + if (timer->expires == expires && vtimer_pending(timer)) + return 1; + spin_lock_irqsave(&virt_timer_lock, flags); + rc = vtimer_pending(timer); + if (rc) + list_del_init(&timer->entry); + timer->interval = periodic ? expires : 0; + timer->expires = expires; + internal_add_vtimer(timer); + spin_unlock_irqrestore(&virt_timer_lock, flags); + return rc; +} + +/* + * returns whether it has modified a pending timer (1) or not (0) + */ +int mod_virt_timer(struct vtimer_list *timer, u64 expires) +{ + return __mod_vtimer(timer, expires, 0); +} +EXPORT_SYMBOL(mod_virt_timer); + +/* + * returns whether it has modified a pending timer (1) or not (0) + */ +int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires) +{ + return __mod_vtimer(timer, expires, 1); +} +EXPORT_SYMBOL(mod_virt_timer_periodic); + +/* + * Delete a virtual timer. + * + * returns whether the deleted timer was pending (1) or not (0) + */ +int del_virt_timer(struct vtimer_list *timer) +{ + unsigned long flags; + + if (!vtimer_pending(timer)) + return 0; + spin_lock_irqsave(&virt_timer_lock, flags); + list_del_init(&timer->entry); + spin_unlock_irqrestore(&virt_timer_lock, flags); + return 1; +} +EXPORT_SYMBOL(del_virt_timer); + +/* + * Start the virtual CPU timer on the current CPU. + */ +void vtime_init(void) +{ + /* set initial cpu timer */ + set_vtimer(VTIMER_MAX_SLICE); + /* Setup initial MT scaling values */ + if (smp_cpu_mtid) { + __this_cpu_write(mt_scaling_jiffies, jiffies); + __this_cpu_write(mt_scaling_mult, 1); + __this_cpu_write(mt_scaling_div, 1); + stcctm5(smp_cpu_mtid + 1, this_cpu_ptr(mt_cycles)); + } +} |