From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 11 Apr 2024 10:27:49 +0200 Subject: Adding upstream version 6.6.15. Signed-off-by: Daniel Baumann --- arch/s390/Kbuild | 13 + arch/s390/Kconfig | 935 ++++ arch/s390/Kconfig.debug | 22 + arch/s390/Makefile | 177 + arch/s390/appldata/Makefile | 9 + arch/s390/appldata/appldata.h | 48 + arch/s390/appldata/appldata_base.c | 428 ++ arch/s390/appldata/appldata_mem.c | 161 + arch/s390/appldata/appldata_net_sum.c | 163 + arch/s390/appldata/appldata_os.c | 211 + arch/s390/boot/.gitignore | 7 + arch/s390/boot/Makefile | 133 + arch/s390/boot/als.c | 114 + arch/s390/boot/boot.h | 102 + arch/s390/boot/clz_ctz.c | 2 + arch/s390/boot/cmdline.c | 2 + arch/s390/boot/ctype.c | 2 + arch/s390/boot/decompressor.c | 86 + arch/s390/boot/decompressor.h | 12 + arch/s390/boot/ebcdic.c | 2 + arch/s390/boot/head.S | 320 ++ arch/s390/boot/head_kdump.S | 101 + arch/s390/boot/install.sh | 24 + arch/s390/boot/ipl_data.c | 84 + arch/s390/boot/ipl_parm.c | 306 ++ arch/s390/boot/ipl_report.c | 165 + arch/s390/boot/ipl_vmparm.c | 2 + arch/s390/boot/kaslr.c | 198 + arch/s390/boot/machine_kexec_reloc.c | 2 + arch/s390/boot/mem.S | 2 + arch/s390/boot/pgm_check_info.c | 179 + arch/s390/boot/physmem_info.c | 328 ++ arch/s390/boot/sclp_early_core.c | 11 + arch/s390/boot/startup.c | 378 ++ arch/s390/boot/string.c | 140 + arch/s390/boot/uv.c | 95 + arch/s390/boot/uv.h | 22 + arch/s390/boot/version.c | 8 + arch/s390/boot/vmem.c | 458 ++ arch/s390/boot/vmlinux.lds.S | 128 + arch/s390/configs/btf.config | 2 + arch/s390/configs/debug_defconfig | 887 ++++ arch/s390/configs/defconfig | 816 ++++ arch/s390/configs/kasan.config | 4 + arch/s390/configs/zfcpdump_defconfig | 82 + arch/s390/crypto/Kconfig | 135 + arch/s390/crypto/Makefile | 21 + arch/s390/crypto/aes_s390.c | 1057 +++++ arch/s390/crypto/arch_random.c | 19 + arch/s390/crypto/chacha-glue.c | 130 + arch/s390/crypto/chacha-s390.S | 908 ++++ arch/s390/crypto/chacha-s390.h | 14 + arch/s390/crypto/crc32-vx.c | 310 ++ arch/s390/crypto/crc32be-vx.S | 213 + arch/s390/crypto/crc32le-vx.S | 275 ++ arch/s390/crypto/des_s390.c | 502 +++ arch/s390/crypto/ghash_s390.c | 154 + arch/s390/crypto/paes_s390.c | 809 ++++ arch/s390/crypto/prng.c | 911 ++++ arch/s390/crypto/sha.h | 35 + arch/s390/crypto/sha1_s390.c | 103 + arch/s390/crypto/sha256_s390.c | 143 + arch/s390/crypto/sha3_256_s390.c | 146 + arch/s390/crypto/sha3_512_s390.c | 154 + arch/s390/crypto/sha512_s390.c | 149 + arch/s390/crypto/sha_common.c | 124 + arch/s390/hypfs/Makefile | 14 + arch/s390/hypfs/hypfs.h | 85 + arch/s390/hypfs/hypfs_dbfs.c | 122 + arch/s390/hypfs/hypfs_diag.c | 219 + arch/s390/hypfs/hypfs_diag.h | 35 + arch/s390/hypfs/hypfs_diag0c.c | 125 + arch/s390/hypfs/hypfs_diag_fs.c | 393 ++ arch/s390/hypfs/hypfs_sprp.c | 151 + arch/s390/hypfs/hypfs_vm.c | 141 + arch/s390/hypfs/hypfs_vm.h | 50 + arch/s390/hypfs/hypfs_vm_fs.c | 139 + arch/s390/hypfs/inode.c | 477 +++ arch/s390/include/asm/Kbuild | 9 + arch/s390/include/asm/abs_lowcore.h | 27 + arch/s390/include/asm/airq.h | 110 + arch/s390/include/asm/alternative-asm.h | 56 + arch/s390/include/asm/alternative.h | 120 + arch/s390/include/asm/ap.h | 555 +++ arch/s390/include/asm/appldata.h | 68 + arch/s390/include/asm/archrandom.h | 38 + arch/s390/include/asm/asm-const.h | 12 + arch/s390/include/asm/asm-extable.h | 92 + arch/s390/include/asm/asm-prototypes.h | 13 + arch/s390/include/asm/atomic.h | 150 + arch/s390/include/asm/atomic_ops.h | 201 + arch/s390/include/asm/barrier.h | 82 + arch/s390/include/asm/bitops.h | 378 ++ arch/s390/include/asm/boot_data.h | 18 + arch/s390/include/asm/bug.h | 71 + arch/s390/include/asm/cache.h | 19 + arch/s390/include/asm/ccwdev.h | 237 ++ arch/s390/include/asm/ccwgroup.h | 75 + arch/s390/include/asm/checksum.h | 133 + arch/s390/include/asm/chpid.h | 51 + arch/s390/include/asm/chsc.h | 69 + arch/s390/include/asm/cio.h | 378 ++ arch/s390/include/asm/clocksource.h | 7 + arch/s390/include/asm/clp.h | 59 + arch/s390/include/asm/cmb.h | 14 + arch/s390/include/asm/cmpxchg.h | 207 + arch/s390/include/asm/compat.h | 140 + arch/s390/include/asm/cpacf.h | 551 +++ arch/s390/include/asm/cpcmd.h | 32 + arch/s390/include/asm/cpu.h | 28 + arch/s390/include/asm/cpu_mf-insn.h | 22 + arch/s390/include/asm/cpu_mf.h | 277 ++ arch/s390/include/asm/cpufeature.h | 23 + arch/s390/include/asm/cputime.h | 21 + arch/s390/include/asm/crw.h | 55 + arch/s390/include/asm/css_chars.h | 47 + arch/s390/include/asm/ctl_reg.h | 146 + arch/s390/include/asm/current.h | 19 + arch/s390/include/asm/debug.h | 490 +++ arch/s390/include/asm/delay.h | 24 + arch/s390/include/asm/diag.h | 351 ++ arch/s390/include/asm/dis.h | 30 + arch/s390/include/asm/dma.h | 14 + arch/s390/include/asm/dwarf.h | 37 + arch/s390/include/asm/eadm.h | 120 + arch/s390/include/asm/ebcdic.h | 47 + arch/s390/include/asm/elf.h | 302 ++ arch/s390/include/asm/entry-common.h | 63 + arch/s390/include/asm/exec.h | 13 + arch/s390/include/asm/extable.h | 72 + arch/s390/include/asm/extmem.h | 32 + arch/s390/include/asm/facility.h | 114 + arch/s390/include/asm/fcx.h | 312 ++ arch/s390/include/asm/fpu/api.h | 119 + arch/s390/include/asm/fpu/internal.h | 62 + arch/s390/include/asm/fpu/types.h | 38 + arch/s390/include/asm/ftrace.h | 169 + arch/s390/include/asm/ftrace.lds.h | 21 + arch/s390/include/asm/futex.h | 81 + arch/s390/include/asm/gmap.h | 188 + arch/s390/include/asm/hardirq.h | 28 + arch/s390/include/asm/hugetlb.h | 147 + arch/s390/include/asm/hw_irq.h | 11 + arch/s390/include/asm/idals.h | 244 ++ arch/s390/include/asm/idle.h | 27 + arch/s390/include/asm/io.h | 80 + arch/s390/include/asm/ipl.h | 170 + arch/s390/include/asm/irq.h | 123 + arch/s390/include/asm/irq_work.h | 12 + arch/s390/include/asm/irqflags.h | 78 + arch/s390/include/asm/isc.h | 31 + arch/s390/include/asm/itcw.h | 31 + arch/s390/include/asm/jump_label.h | 55 + arch/s390/include/asm/kasan.h | 18 + arch/s390/include/asm/kdebug.h | 28 + arch/s390/include/asm/kexec.h | 110 + arch/s390/include/asm/kfence.h | 42 + arch/s390/include/asm/kprobes.h | 81 + arch/s390/include/asm/kvm_host.h | 1060 +++++ arch/s390/include/asm/kvm_para.h | 123 + arch/s390/include/asm/linkage.h | 10 + arch/s390/include/asm/lowcore.h | 232 + arch/s390/include/asm/maccess.h | 20 + arch/s390/include/asm/mem_encrypt.h | 12 + arch/s390/include/asm/mmu.h | 45 + arch/s390/include/asm/mmu_context.h | 128 + arch/s390/include/asm/mmzone.h | 17 + arch/s390/include/asm/module.h | 41 + arch/s390/include/asm/msi.h | 17 + arch/s390/include/asm/nmi.h | 108 + arch/s390/include/asm/nospec-branch.h | 22 + arch/s390/include/asm/nospec-insn.h | 132 + arch/s390/include/asm/numa.h | 25 + arch/s390/include/asm/os_info.h | 54 + arch/s390/include/asm/page-states.h | 21 + arch/s390/include/asm/page.h | 218 + arch/s390/include/asm/pai.h | 84 + arch/s390/include/asm/pci.h | 327 ++ arch/s390/include/asm/pci_clp.h | 217 + arch/s390/include/asm/pci_debug.h | 30 + arch/s390/include/asm/pci_dma.h | 198 + arch/s390/include/asm/pci_insn.h | 159 + arch/s390/include/asm/pci_io.h | 206 + arch/s390/include/asm/percpu.h | 185 + arch/s390/include/asm/perf_event.h | 81 + arch/s390/include/asm/pfault.h | 26 + arch/s390/include/asm/pgalloc.h | 157 + arch/s390/include/asm/pgtable.h | 1872 ++++++++ arch/s390/include/asm/physmem_info.h | 172 + arch/s390/include/asm/pkey.h | 28 + arch/s390/include/asm/pnet.h | 15 + arch/s390/include/asm/preempt.h | 139 + arch/s390/include/asm/processor.h | 380 ++ arch/s390/include/asm/ptdump.h | 14 + arch/s390/include/asm/ptrace.h | 263 ++ arch/s390/include/asm/purgatory.h | 17 + arch/s390/include/asm/qdio.h | 364 ++ arch/s390/include/asm/runtime_instr.h | 28 + arch/s390/include/asm/rwonce.h | 31 + arch/s390/include/asm/schid.h | 22 + arch/s390/include/asm/sclp.h | 164 + arch/s390/include/asm/scsw.h | 1050 +++++ arch/s390/include/asm/seccomp.h | 28 + arch/s390/include/asm/sections.h | 29 + arch/s390/include/asm/set_memory.h | 66 + arch/s390/include/asm/setup.h | 161 + arch/s390/include/asm/signal.h | 26 + arch/s390/include/asm/sigp.h | 72 + arch/s390/include/asm/smp.h | 70 + arch/s390/include/asm/softirq_stack.h | 14 + arch/s390/include/asm/sparsemem.h | 8 + arch/s390/include/asm/spinlock.h | 148 + arch/s390/include/asm/spinlock_types.h | 22 + arch/s390/include/asm/stacktrace.h | 241 ++ arch/s390/include/asm/stp.h | 98 + arch/s390/include/asm/string.h | 206 + arch/s390/include/asm/switch_to.h | 49 + arch/s390/include/asm/syscall.h | 154 + arch/s390/include/asm/syscall_wrapper.h | 140 + arch/s390/include/asm/sysinfo.h | 201 + arch/s390/include/asm/text-patching.h | 16 + arch/s390/include/asm/thread_info.h | 106 + arch/s390/include/asm/timex.h | 281 ++ arch/s390/include/asm/tlb.h | 137 + arch/s390/include/asm/tlbflush.h | 129 + arch/s390/include/asm/topology.h | 101 + arch/s390/include/asm/tpi.h | 37 + arch/s390/include/asm/trace/diag.h | 44 + arch/s390/include/asm/trace/zcrypt.h | 123 + arch/s390/include/asm/types.h | 19 + arch/s390/include/asm/uaccess.h | 601 +++ arch/s390/include/asm/unistd.h | 40 + arch/s390/include/asm/unwind.h | 98 + arch/s390/include/asm/uprobes.h | 33 + arch/s390/include/asm/user.h | 71 + arch/s390/include/asm/uv.h | 517 +++ arch/s390/include/asm/vdso.h | 34 + arch/s390/include/asm/vdso/clocksource.h | 8 + arch/s390/include/asm/vdso/data.h | 13 + arch/s390/include/asm/vdso/gettimeofday.h | 63 + arch/s390/include/asm/vdso/processor.h | 7 + arch/s390/include/asm/vdso/vsyscall.h | 26 + arch/s390/include/asm/vmalloc.h | 4 + arch/s390/include/asm/vmlinux.lds.h | 33 + arch/s390/include/asm/vtime.h | 21 + arch/s390/include/asm/vtimer.h | 30 + arch/s390/include/asm/vx-insn-asm.h | 681 +++ arch/s390/include/asm/vx-insn.h | 19 + arch/s390/include/asm/xor.h | 21 + arch/s390/include/uapi/asm/Kbuild | 4 + arch/s390/include/uapi/asm/auxvec.h | 9 + arch/s390/include/uapi/asm/bitsperlong.h | 14 + arch/s390/include/uapi/asm/bpf_perf_event.h | 9 + arch/s390/include/uapi/asm/byteorder.h | 7 + arch/s390/include/uapi/asm/chpid.h | 23 + arch/s390/include/uapi/asm/chsc.h | 144 + arch/s390/include/uapi/asm/clp.h | 29 + arch/s390/include/uapi/asm/cmb.h | 54 + arch/s390/include/uapi/asm/dasd.h | 354 ++ arch/s390/include/uapi/asm/fs3270.h | 25 + arch/s390/include/uapi/asm/guarded_storage.h | 78 + arch/s390/include/uapi/asm/hwctrset.h | 51 + arch/s390/include/uapi/asm/hypfs.h | 55 + arch/s390/include/uapi/asm/ioctls.h | 9 + arch/s390/include/uapi/asm/ipcbuf.h | 34 + arch/s390/include/uapi/asm/ipl.h | 208 + arch/s390/include/uapi/asm/kvm.h | 308 ++ arch/s390/include/uapi/asm/kvm_para.h | 8 + arch/s390/include/uapi/asm/kvm_perf.h | 22 + arch/s390/include/uapi/asm/monwriter.h | 32 + arch/s390/include/uapi/asm/perf_regs.h | 44 + arch/s390/include/uapi/asm/pkey.h | 452 ++ arch/s390/include/uapi/asm/posix_types.h | 58 + arch/s390/include/uapi/asm/ptrace.h | 455 ++ arch/s390/include/uapi/asm/qeth.h | 116 + arch/s390/include/uapi/asm/raw3270.h | 75 + arch/s390/include/uapi/asm/runtime_instr.h | 74 + arch/s390/include/uapi/asm/schid.h | 20 + arch/s390/include/uapi/asm/sclp_ctl.h | 25 + arch/s390/include/uapi/asm/setup.h | 1 + arch/s390/include/uapi/asm/sie.h | 252 ++ arch/s390/include/uapi/asm/sigcontext.h | 85 + arch/s390/include/uapi/asm/signal.h | 115 + arch/s390/include/uapi/asm/stat.h | 104 + arch/s390/include/uapi/asm/statfs.h | 51 + arch/s390/include/uapi/asm/sthyi.h | 7 + arch/s390/include/uapi/asm/tape390.h | 103 + arch/s390/include/uapi/asm/types.h | 30 + arch/s390/include/uapi/asm/ucontext.h | 41 + arch/s390/include/uapi/asm/unistd.h | 17 + arch/s390/include/uapi/asm/uvdevice.h | 102 + arch/s390/include/uapi/asm/virtio-ccw.h | 18 + arch/s390/include/uapi/asm/vmcp.h | 25 + arch/s390/include/uapi/asm/vtoc.h | 214 + arch/s390/include/uapi/asm/zcrypt.h | 373 ++ arch/s390/kernel/.gitignore | 2 + arch/s390/kernel/Makefile | 85 + arch/s390/kernel/abs_lowcore.c | 46 + arch/s390/kernel/alternative.c | 75 + arch/s390/kernel/asm-offsets.c | 190 + arch/s390/kernel/audit.c | 81 + arch/s390/kernel/audit.h | 16 + arch/s390/kernel/cache.c | 170 + arch/s390/kernel/cert_store.c | 812 ++++ arch/s390/kernel/compat_audit.c | 48 + arch/s390/kernel/compat_linux.c | 289 ++ arch/s390/kernel/compat_linux.h | 101 + arch/s390/kernel/compat_ptrace.h | 64 + arch/s390/kernel/compat_signal.c | 422 ++ arch/s390/kernel/cpcmd.c | 114 + arch/s390/kernel/cpufeature.c | 46 + arch/s390/kernel/crash_dump.c | 652 +++ arch/s390/kernel/debug.c | 1574 +++++++ arch/s390/kernel/diag.c | 269 ++ arch/s390/kernel/dis.c | 590 +++ arch/s390/kernel/dumpstack.c | 224 + arch/s390/kernel/early.c | 320 ++ arch/s390/kernel/early_printk.c | 36 + arch/s390/kernel/earlypgm.S | 23 + arch/s390/kernel/ebcdic.c | 401 ++ arch/s390/kernel/entry.S | 669 +++ arch/s390/kernel/entry.h | 75 + arch/s390/kernel/fpu.c | 264 ++ arch/s390/kernel/ftrace.c | 340 ++ arch/s390/kernel/ftrace.h | 24 + arch/s390/kernel/guarded_storage.c | 128 + arch/s390/kernel/head64.S | 40 + arch/s390/kernel/idle.c | 94 + arch/s390/kernel/ima_arch.c | 14 + arch/s390/kernel/ipl.c | 2522 +++++++++++ arch/s390/kernel/ipl_vmparm.c | 38 + arch/s390/kernel/irq.c | 402 ++ arch/s390/kernel/jump_label.c | 82 + arch/s390/kernel/kdebugfs.c | 14 + arch/s390/kernel/kexec_elf.c | 136 + arch/s390/kernel/kexec_image.c | 65 + arch/s390/kernel/kprobes.c | 525 +++ arch/s390/kernel/kprobes.h | 9 + arch/s390/kernel/kprobes_insn_page.S | 22 + arch/s390/kernel/lgr.c | 187 + arch/s390/kernel/machine_kexec.c | 286 ++ arch/s390/kernel/machine_kexec_file.c | 379 ++ arch/s390/kernel/machine_kexec_reloc.c | 56 + arch/s390/kernel/mcount.S | 195 + arch/s390/kernel/module.c | 576 +++ arch/s390/kernel/nmi.c | 513 +++ arch/s390/kernel/nospec-branch.c | 156 + arch/s390/kernel/nospec-sysfs.c | 23 + arch/s390/kernel/numa.c | 35 + arch/s390/kernel/os_info.c | 173 + arch/s390/kernel/perf_cpum_cf.c | 1950 +++++++++ arch/s390/kernel/perf_cpum_cf_events.c | 909 ++++ arch/s390/kernel/perf_cpum_sf.c | 2280 ++++++++++ arch/s390/kernel/perf_event.c | 223 + arch/s390/kernel/perf_pai_crypto.c | 698 +++ arch/s390/kernel/perf_pai_ext.c | 667 +++ arch/s390/kernel/perf_regs.c | 68 + arch/s390/kernel/process.c | 240 ++ arch/s390/kernel/processor.c | 366 ++ arch/s390/kernel/ptrace.c | 1608 +++++++ arch/s390/kernel/reipl.S | 81 + arch/s390/kernel/relocate_kernel.S | 76 + arch/s390/kernel/rethook.c | 34 + arch/s390/kernel/rethook.h | 7 + arch/s390/kernel/runtime_instr.c | 102 + arch/s390/kernel/setup.c | 1012 +++++ arch/s390/kernel/signal.c | 534 +++ arch/s390/kernel/smp.c | 1317 ++++++ arch/s390/kernel/stacktrace.c | 60 + arch/s390/kernel/sthyi.c | 517 +++ arch/s390/kernel/syscall.c | 170 + arch/s390/kernel/syscalls/Makefile | 45 + arch/s390/kernel/syscalls/syscall.tbl | 457 ++ arch/s390/kernel/syscalls/syscalltbl | 232 + arch/s390/kernel/sysinfo.c | 570 +++ arch/s390/kernel/text_amode31.S | 159 + arch/s390/kernel/time.c | 944 +++++ arch/s390/kernel/topology.c | 661 +++ arch/s390/kernel/trace.c | 33 + arch/s390/kernel/traps.c | 403 ++ arch/s390/kernel/unwind_bc.c | 178 + arch/s390/kernel/uprobes.c | 388 ++ arch/s390/kernel/uv.c | 717 ++++ arch/s390/kernel/vdso.c | 258 ++ arch/s390/kernel/vdso32/.gitignore | 2 + arch/s390/kernel/vdso32/Makefile | 80 + arch/s390/kernel/vdso32/gen_vdso_offsets.sh | 15 + arch/s390/kernel/vdso32/note.S | 13 + arch/s390/kernel/vdso32/vdso32.lds.S | 142 + arch/s390/kernel/vdso32/vdso32_wrapper.S | 15 + arch/s390/kernel/vdso32/vdso_user_wrapper.S | 22 + arch/s390/kernel/vdso64/.gitignore | 2 + arch/s390/kernel/vdso64/Makefile | 89 + arch/s390/kernel/vdso64/gen_vdso_offsets.sh | 15 + arch/s390/kernel/vdso64/getcpu.c | 21 + arch/s390/kernel/vdso64/note.S | 13 + arch/s390/kernel/vdso64/vdso.h | 14 + arch/s390/kernel/vdso64/vdso64.lds.S | 146 + arch/s390/kernel/vdso64/vdso64_generic.c | 19 + arch/s390/kernel/vdso64/vdso64_wrapper.S | 15 + arch/s390/kernel/vdso64/vdso_user_wrapper.S | 57 + arch/s390/kernel/vmlinux.lds.S | 243 ++ arch/s390/kernel/vtime.c | 454 ++ arch/s390/kvm/Kconfig | 59 + arch/s390/kvm/Makefile | 14 + arch/s390/kvm/diag.c | 307 ++ arch/s390/kvm/gaccess.c | 1624 +++++++ arch/s390/kvm/gaccess.h | 458 ++ arch/s390/kvm/guestdbg.c | 626 +++ arch/s390/kvm/intercept.c | 667 +++ arch/s390/kvm/interrupt.c | 3481 +++++++++++++++ arch/s390/kvm/kvm-s390.c | 5895 ++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 524 +++ arch/s390/kvm/pci.c | 704 +++ arch/s390/kvm/pci.h | 87 + arch/s390/kvm/priv.c | 1573 +++++++ arch/s390/kvm/pv.c | 894 ++++ arch/s390/kvm/sigp.c | 495 +++ arch/s390/kvm/trace-s390.h | 340 ++ arch/s390/kvm/trace.h | 462 ++ arch/s390/kvm/vsie.c | 1488 +++++++ arch/s390/lib/Makefile | 25 + arch/s390/lib/delay.c | 45 + arch/s390/lib/error-inject.c | 14 + arch/s390/lib/expoline/Makefile | 3 + arch/s390/lib/expoline/expoline.S | 12 + arch/s390/lib/find.c | 76 + arch/s390/lib/mem.S | 197 + arch/s390/lib/probes.c | 161 + arch/s390/lib/spinlock.c | 324 ++ arch/s390/lib/string.c | 348 ++ arch/s390/lib/test_kprobes.c | 75 + arch/s390/lib/test_kprobes.h | 10 + arch/s390/lib/test_kprobes_asm.S | 45 + arch/s390/lib/test_modules.c | 32 + arch/s390/lib/test_modules.h | 53 + arch/s390/lib/test_modules_helpers.c | 13 + arch/s390/lib/test_unwind.c | 522 +++ arch/s390/lib/tishift.S | 63 + arch/s390/lib/uaccess.c | 185 + arch/s390/lib/xor.c | 140 + arch/s390/mm/Makefile | 13 + arch/s390/mm/cmm.c | 431 ++ arch/s390/mm/dump_pagetables.c | 315 ++ arch/s390/mm/extable.c | 86 + arch/s390/mm/extmem.c | 660 +++ arch/s390/mm/fault.c | 708 ++++ arch/s390/mm/gmap.c | 2894 +++++++++++++ arch/s390/mm/hugetlbpage.c | 342 ++ arch/s390/mm/init.c | 313 ++ arch/s390/mm/maccess.c | 193 + arch/s390/mm/mmap.c | 210 + arch/s390/mm/page-states.c | 226 + arch/s390/mm/pageattr.c | 452 ++ arch/s390/mm/pfault.c | 248 ++ arch/s390/mm/pgalloc.c | 757 ++++ arch/s390/mm/pgtable.c | 1207 ++++++ arch/s390/mm/vmem.c | 669 +++ arch/s390/net/Makefile | 6 + arch/s390/net/bpf_jit.h | 55 + arch/s390/net/bpf_jit_comp.c | 2537 +++++++++++ arch/s390/net/pnet.c | 91 + arch/s390/pci/Makefile | 9 + arch/s390/pci/pci.c | 1135 +++++ arch/s390/pci/pci_bus.c | 379 ++ arch/s390/pci/pci_bus.h | 42 + arch/s390/pci/pci_clp.c | 669 +++ arch/s390/pci/pci_debug.c | 210 + arch/s390/pci/pci_dma.c | 746 ++++ arch/s390/pci/pci_event.c | 389 ++ arch/s390/pci/pci_insn.c | 443 ++ arch/s390/pci/pci_iov.c | 99 + arch/s390/pci/pci_iov.h | 30 + arch/s390/pci/pci_irq.c | 530 +++ arch/s390/pci/pci_kvm_hook.c | 11 + arch/s390/pci/pci_mmio.c | 335 ++ arch/s390/pci/pci_sysfs.c | 239 ++ arch/s390/purgatory/.gitignore | 5 + arch/s390/purgatory/Makefile | 54 + arch/s390/purgatory/head.S | 265 ++ arch/s390/purgatory/kexec-purgatory.S | 12 + arch/s390/purgatory/purgatory.c | 33 + arch/s390/purgatory/purgatory.lds.S | 54 + arch/s390/purgatory/string.c | 3 + arch/s390/tools/.gitignore | 3 + arch/s390/tools/Makefile | 27 + arch/s390/tools/gcc-thunk-extern.sh | 24 + arch/s390/tools/gen_facilities.c | 168 + arch/s390/tools/gen_opcode_table.c | 337 ++ arch/s390/tools/opcodes.txt | 1250 ++++++ 490 files changed, 118141 insertions(+) create mode 100644 arch/s390/Kbuild create mode 100644 arch/s390/Kconfig create mode 100644 arch/s390/Kconfig.debug create mode 100644 arch/s390/Makefile create mode 100644 arch/s390/appldata/Makefile create mode 100644 arch/s390/appldata/appldata.h create mode 100644 arch/s390/appldata/appldata_base.c create mode 100644 arch/s390/appldata/appldata_mem.c create mode 100644 arch/s390/appldata/appldata_net_sum.c create mode 100644 arch/s390/appldata/appldata_os.c create mode 100644 arch/s390/boot/.gitignore create mode 100644 arch/s390/boot/Makefile create mode 100644 arch/s390/boot/als.c create mode 100644 arch/s390/boot/boot.h create mode 100644 arch/s390/boot/clz_ctz.c create mode 100644 arch/s390/boot/cmdline.c create mode 100644 arch/s390/boot/ctype.c create mode 100644 arch/s390/boot/decompressor.c create mode 100644 arch/s390/boot/decompressor.h create mode 100644 arch/s390/boot/ebcdic.c create mode 100644 arch/s390/boot/head.S create mode 100644 arch/s390/boot/head_kdump.S create mode 100755 arch/s390/boot/install.sh create mode 100644 arch/s390/boot/ipl_data.c create mode 100644 arch/s390/boot/ipl_parm.c create mode 100644 arch/s390/boot/ipl_report.c create mode 100644 arch/s390/boot/ipl_vmparm.c create mode 100644 arch/s390/boot/kaslr.c create mode 100644 arch/s390/boot/machine_kexec_reloc.c create mode 100644 arch/s390/boot/mem.S create mode 100644 arch/s390/boot/pgm_check_info.c create mode 100644 arch/s390/boot/physmem_info.c create mode 100644 arch/s390/boot/sclp_early_core.c create mode 100644 arch/s390/boot/startup.c create mode 100644 arch/s390/boot/string.c create mode 100644 arch/s390/boot/uv.c create mode 100644 arch/s390/boot/uv.h create mode 100644 arch/s390/boot/version.c create mode 100644 arch/s390/boot/vmem.c create mode 100644 arch/s390/boot/vmlinux.lds.S create mode 100644 arch/s390/configs/btf.config create mode 100644 arch/s390/configs/debug_defconfig create mode 100644 arch/s390/configs/defconfig create mode 100644 arch/s390/configs/kasan.config create mode 100644 arch/s390/configs/zfcpdump_defconfig create mode 100644 arch/s390/crypto/Kconfig create mode 100644 arch/s390/crypto/Makefile create mode 100644 arch/s390/crypto/aes_s390.c create mode 100644 arch/s390/crypto/arch_random.c create mode 100644 arch/s390/crypto/chacha-glue.c create mode 100644 arch/s390/crypto/chacha-s390.S create mode 100644 arch/s390/crypto/chacha-s390.h create mode 100644 arch/s390/crypto/crc32-vx.c create mode 100644 arch/s390/crypto/crc32be-vx.S create mode 100644 arch/s390/crypto/crc32le-vx.S create mode 100644 arch/s390/crypto/des_s390.c create mode 100644 arch/s390/crypto/ghash_s390.c create mode 100644 arch/s390/crypto/paes_s390.c create mode 100644 arch/s390/crypto/prng.c create mode 100644 arch/s390/crypto/sha.h create mode 100644 arch/s390/crypto/sha1_s390.c create mode 100644 arch/s390/crypto/sha256_s390.c create mode 100644 arch/s390/crypto/sha3_256_s390.c create mode 100644 arch/s390/crypto/sha3_512_s390.c create mode 100644 arch/s390/crypto/sha512_s390.c create mode 100644 arch/s390/crypto/sha_common.c create mode 100644 arch/s390/hypfs/Makefile create mode 100644 arch/s390/hypfs/hypfs.h create mode 100644 arch/s390/hypfs/hypfs_dbfs.c create mode 100644 arch/s390/hypfs/hypfs_diag.c create mode 100644 arch/s390/hypfs/hypfs_diag.h create mode 100644 arch/s390/hypfs/hypfs_diag0c.c create mode 100644 arch/s390/hypfs/hypfs_diag_fs.c create mode 100644 arch/s390/hypfs/hypfs_sprp.c create mode 100644 arch/s390/hypfs/hypfs_vm.c create mode 100644 arch/s390/hypfs/hypfs_vm.h create mode 100644 arch/s390/hypfs/hypfs_vm_fs.c create mode 100644 arch/s390/hypfs/inode.c create mode 100644 arch/s390/include/asm/Kbuild create mode 100644 arch/s390/include/asm/abs_lowcore.h create mode 100644 arch/s390/include/asm/airq.h create mode 100644 arch/s390/include/asm/alternative-asm.h create mode 100644 arch/s390/include/asm/alternative.h create mode 100644 arch/s390/include/asm/ap.h create mode 100644 arch/s390/include/asm/appldata.h create mode 100644 arch/s390/include/asm/archrandom.h create mode 100644 arch/s390/include/asm/asm-const.h create mode 100644 arch/s390/include/asm/asm-extable.h create mode 100644 arch/s390/include/asm/asm-prototypes.h create mode 100644 arch/s390/include/asm/atomic.h create mode 100644 arch/s390/include/asm/atomic_ops.h create mode 100644 arch/s390/include/asm/barrier.h create mode 100644 arch/s390/include/asm/bitops.h create mode 100644 arch/s390/include/asm/boot_data.h create mode 100644 arch/s390/include/asm/bug.h create mode 100644 arch/s390/include/asm/cache.h create mode 100644 arch/s390/include/asm/ccwdev.h create mode 100644 arch/s390/include/asm/ccwgroup.h create mode 100644 arch/s390/include/asm/checksum.h create mode 100644 arch/s390/include/asm/chpid.h create mode 100644 arch/s390/include/asm/chsc.h create mode 100644 arch/s390/include/asm/cio.h create mode 100644 arch/s390/include/asm/clocksource.h create mode 100644 arch/s390/include/asm/clp.h create mode 100644 arch/s390/include/asm/cmb.h create mode 100644 arch/s390/include/asm/cmpxchg.h create mode 100644 arch/s390/include/asm/compat.h create mode 100644 arch/s390/include/asm/cpacf.h create mode 100644 arch/s390/include/asm/cpcmd.h create mode 100644 arch/s390/include/asm/cpu.h create mode 100644 arch/s390/include/asm/cpu_mf-insn.h create mode 100644 arch/s390/include/asm/cpu_mf.h create mode 100644 arch/s390/include/asm/cpufeature.h create mode 100644 arch/s390/include/asm/cputime.h create mode 100644 arch/s390/include/asm/crw.h create mode 100644 arch/s390/include/asm/css_chars.h create mode 100644 arch/s390/include/asm/ctl_reg.h create mode 100644 arch/s390/include/asm/current.h create mode 100644 arch/s390/include/asm/debug.h create mode 100644 arch/s390/include/asm/delay.h create mode 100644 arch/s390/include/asm/diag.h create mode 100644 arch/s390/include/asm/dis.h create mode 100644 arch/s390/include/asm/dma.h create mode 100644 arch/s390/include/asm/dwarf.h create mode 100644 arch/s390/include/asm/eadm.h create mode 100644 arch/s390/include/asm/ebcdic.h create mode 100644 arch/s390/include/asm/elf.h create mode 100644 arch/s390/include/asm/entry-common.h create mode 100644 arch/s390/include/asm/exec.h create mode 100644 arch/s390/include/asm/extable.h create mode 100644 arch/s390/include/asm/extmem.h create mode 100644 arch/s390/include/asm/facility.h create mode 100644 arch/s390/include/asm/fcx.h create mode 100644 arch/s390/include/asm/fpu/api.h create mode 100644 arch/s390/include/asm/fpu/internal.h create mode 100644 arch/s390/include/asm/fpu/types.h create mode 100644 arch/s390/include/asm/ftrace.h create mode 100644 arch/s390/include/asm/ftrace.lds.h create mode 100644 arch/s390/include/asm/futex.h create mode 100644 arch/s390/include/asm/gmap.h create mode 100644 arch/s390/include/asm/hardirq.h create mode 100644 arch/s390/include/asm/hugetlb.h create mode 100644 arch/s390/include/asm/hw_irq.h create mode 100644 arch/s390/include/asm/idals.h create mode 100644 arch/s390/include/asm/idle.h create mode 100644 arch/s390/include/asm/io.h create mode 100644 arch/s390/include/asm/ipl.h create mode 100644 arch/s390/include/asm/irq.h create mode 100644 arch/s390/include/asm/irq_work.h create mode 100644 arch/s390/include/asm/irqflags.h create mode 100644 arch/s390/include/asm/isc.h create mode 100644 arch/s390/include/asm/itcw.h create mode 100644 arch/s390/include/asm/jump_label.h create mode 100644 arch/s390/include/asm/kasan.h create mode 100644 arch/s390/include/asm/kdebug.h create mode 100644 arch/s390/include/asm/kexec.h create mode 100644 arch/s390/include/asm/kfence.h create mode 100644 arch/s390/include/asm/kprobes.h create mode 100644 arch/s390/include/asm/kvm_host.h create mode 100644 arch/s390/include/asm/kvm_para.h create mode 100644 arch/s390/include/asm/linkage.h create mode 100644 arch/s390/include/asm/lowcore.h create mode 100644 arch/s390/include/asm/maccess.h create mode 100644 arch/s390/include/asm/mem_encrypt.h create mode 100644 arch/s390/include/asm/mmu.h create mode 100644 arch/s390/include/asm/mmu_context.h create mode 100644 arch/s390/include/asm/mmzone.h create mode 100644 arch/s390/include/asm/module.h create mode 100644 arch/s390/include/asm/msi.h create mode 100644 arch/s390/include/asm/nmi.h create mode 100644 arch/s390/include/asm/nospec-branch.h create mode 100644 arch/s390/include/asm/nospec-insn.h create mode 100644 arch/s390/include/asm/numa.h create mode 100644 arch/s390/include/asm/os_info.h create mode 100644 arch/s390/include/asm/page-states.h create mode 100644 arch/s390/include/asm/page.h create mode 100644 arch/s390/include/asm/pai.h create mode 100644 arch/s390/include/asm/pci.h create mode 100644 arch/s390/include/asm/pci_clp.h create mode 100644 arch/s390/include/asm/pci_debug.h create mode 100644 arch/s390/include/asm/pci_dma.h create mode 100644 arch/s390/include/asm/pci_insn.h create mode 100644 arch/s390/include/asm/pci_io.h create mode 100644 arch/s390/include/asm/percpu.h create mode 100644 arch/s390/include/asm/perf_event.h create mode 100644 arch/s390/include/asm/pfault.h create mode 100644 arch/s390/include/asm/pgalloc.h create mode 100644 arch/s390/include/asm/pgtable.h create mode 100644 arch/s390/include/asm/physmem_info.h create mode 100644 arch/s390/include/asm/pkey.h create mode 100644 arch/s390/include/asm/pnet.h create mode 100644 arch/s390/include/asm/preempt.h create mode 100644 arch/s390/include/asm/processor.h create mode 100644 arch/s390/include/asm/ptdump.h create mode 100644 arch/s390/include/asm/ptrace.h create mode 100644 arch/s390/include/asm/purgatory.h create mode 100644 arch/s390/include/asm/qdio.h create mode 100644 arch/s390/include/asm/runtime_instr.h create mode 100644 arch/s390/include/asm/rwonce.h create mode 100644 arch/s390/include/asm/schid.h create mode 100644 arch/s390/include/asm/sclp.h create mode 100644 arch/s390/include/asm/scsw.h create mode 100644 arch/s390/include/asm/seccomp.h create mode 100644 arch/s390/include/asm/sections.h create mode 100644 arch/s390/include/asm/set_memory.h create mode 100644 arch/s390/include/asm/setup.h create mode 100644 arch/s390/include/asm/signal.h create mode 100644 arch/s390/include/asm/sigp.h create mode 100644 arch/s390/include/asm/smp.h create mode 100644 arch/s390/include/asm/softirq_stack.h create mode 100644 arch/s390/include/asm/sparsemem.h create mode 100644 arch/s390/include/asm/spinlock.h create mode 100644 arch/s390/include/asm/spinlock_types.h create mode 100644 arch/s390/include/asm/stacktrace.h create mode 100644 arch/s390/include/asm/stp.h create mode 100644 arch/s390/include/asm/string.h create mode 100644 arch/s390/include/asm/switch_to.h create mode 100644 arch/s390/include/asm/syscall.h create mode 100644 arch/s390/include/asm/syscall_wrapper.h create mode 100644 arch/s390/include/asm/sysinfo.h create mode 100644 arch/s390/include/asm/text-patching.h create mode 100644 arch/s390/include/asm/thread_info.h create mode 100644 arch/s390/include/asm/timex.h create mode 100644 arch/s390/include/asm/tlb.h create mode 100644 arch/s390/include/asm/tlbflush.h create mode 100644 arch/s390/include/asm/topology.h create mode 100644 arch/s390/include/asm/tpi.h create mode 100644 arch/s390/include/asm/trace/diag.h create mode 100644 arch/s390/include/asm/trace/zcrypt.h create mode 100644 arch/s390/include/asm/types.h create mode 100644 arch/s390/include/asm/uaccess.h create mode 100644 arch/s390/include/asm/unistd.h create mode 100644 arch/s390/include/asm/unwind.h create mode 100644 arch/s390/include/asm/uprobes.h create mode 100644 arch/s390/include/asm/user.h create mode 100644 arch/s390/include/asm/uv.h create mode 100644 arch/s390/include/asm/vdso.h create mode 100644 arch/s390/include/asm/vdso/clocksource.h create mode 100644 arch/s390/include/asm/vdso/data.h create mode 100644 arch/s390/include/asm/vdso/gettimeofday.h create mode 100644 arch/s390/include/asm/vdso/processor.h create mode 100644 arch/s390/include/asm/vdso/vsyscall.h create mode 100644 arch/s390/include/asm/vmalloc.h create mode 100644 arch/s390/include/asm/vmlinux.lds.h create mode 100644 arch/s390/include/asm/vtime.h create mode 100644 arch/s390/include/asm/vtimer.h create mode 100644 arch/s390/include/asm/vx-insn-asm.h create mode 100644 arch/s390/include/asm/vx-insn.h create mode 100644 arch/s390/include/asm/xor.h create mode 100644 arch/s390/include/uapi/asm/Kbuild create mode 100644 arch/s390/include/uapi/asm/auxvec.h create mode 100644 arch/s390/include/uapi/asm/bitsperlong.h create mode 100644 arch/s390/include/uapi/asm/bpf_perf_event.h create mode 100644 arch/s390/include/uapi/asm/byteorder.h create mode 100644 arch/s390/include/uapi/asm/chpid.h create mode 100644 arch/s390/include/uapi/asm/chsc.h create mode 100644 arch/s390/include/uapi/asm/clp.h create mode 100644 arch/s390/include/uapi/asm/cmb.h create mode 100644 arch/s390/include/uapi/asm/dasd.h create mode 100644 arch/s390/include/uapi/asm/fs3270.h create mode 100644 arch/s390/include/uapi/asm/guarded_storage.h create mode 100644 arch/s390/include/uapi/asm/hwctrset.h create mode 100644 arch/s390/include/uapi/asm/hypfs.h create mode 100644 arch/s390/include/uapi/asm/ioctls.h create mode 100644 arch/s390/include/uapi/asm/ipcbuf.h create mode 100644 arch/s390/include/uapi/asm/ipl.h create mode 100644 arch/s390/include/uapi/asm/kvm.h create mode 100644 arch/s390/include/uapi/asm/kvm_para.h create mode 100644 arch/s390/include/uapi/asm/kvm_perf.h create mode 100644 arch/s390/include/uapi/asm/monwriter.h create mode 100644 arch/s390/include/uapi/asm/perf_regs.h create mode 100644 arch/s390/include/uapi/asm/pkey.h create mode 100644 arch/s390/include/uapi/asm/posix_types.h create mode 100644 arch/s390/include/uapi/asm/ptrace.h create mode 100644 arch/s390/include/uapi/asm/qeth.h create mode 100644 arch/s390/include/uapi/asm/raw3270.h create mode 100644 arch/s390/include/uapi/asm/runtime_instr.h create mode 100644 arch/s390/include/uapi/asm/schid.h create mode 100644 arch/s390/include/uapi/asm/sclp_ctl.h create mode 100644 arch/s390/include/uapi/asm/setup.h create mode 100644 arch/s390/include/uapi/asm/sie.h create mode 100644 arch/s390/include/uapi/asm/sigcontext.h create mode 100644 arch/s390/include/uapi/asm/signal.h create mode 100644 arch/s390/include/uapi/asm/stat.h create mode 100644 arch/s390/include/uapi/asm/statfs.h create mode 100644 arch/s390/include/uapi/asm/sthyi.h create mode 100644 arch/s390/include/uapi/asm/tape390.h create mode 100644 arch/s390/include/uapi/asm/types.h create mode 100644 arch/s390/include/uapi/asm/ucontext.h create mode 100644 arch/s390/include/uapi/asm/unistd.h create mode 100644 arch/s390/include/uapi/asm/uvdevice.h create mode 100644 arch/s390/include/uapi/asm/virtio-ccw.h create mode 100644 arch/s390/include/uapi/asm/vmcp.h create mode 100644 arch/s390/include/uapi/asm/vtoc.h create mode 100644 arch/s390/include/uapi/asm/zcrypt.h create mode 100644 arch/s390/kernel/.gitignore create mode 100644 arch/s390/kernel/Makefile create mode 100644 arch/s390/kernel/abs_lowcore.c create mode 100644 arch/s390/kernel/alternative.c create mode 100644 arch/s390/kernel/asm-offsets.c create mode 100644 arch/s390/kernel/audit.c create mode 100644 arch/s390/kernel/audit.h create mode 100644 arch/s390/kernel/cache.c create mode 100644 arch/s390/kernel/cert_store.c create mode 100644 arch/s390/kernel/compat_audit.c create mode 100644 arch/s390/kernel/compat_linux.c create mode 100644 arch/s390/kernel/compat_linux.h create mode 100644 arch/s390/kernel/compat_ptrace.h create mode 100644 arch/s390/kernel/compat_signal.c create mode 100644 arch/s390/kernel/cpcmd.c create mode 100644 arch/s390/kernel/cpufeature.c create mode 100644 arch/s390/kernel/crash_dump.c create mode 100644 arch/s390/kernel/debug.c create mode 100644 arch/s390/kernel/diag.c create mode 100644 arch/s390/kernel/dis.c create mode 100644 arch/s390/kernel/dumpstack.c create mode 100644 arch/s390/kernel/early.c create mode 100644 arch/s390/kernel/early_printk.c create mode 100644 arch/s390/kernel/earlypgm.S create mode 100644 arch/s390/kernel/ebcdic.c create mode 100644 arch/s390/kernel/entry.S create mode 100644 arch/s390/kernel/entry.h create mode 100644 arch/s390/kernel/fpu.c create mode 100644 arch/s390/kernel/ftrace.c create mode 100644 arch/s390/kernel/ftrace.h create mode 100644 arch/s390/kernel/guarded_storage.c create mode 100644 arch/s390/kernel/head64.S create mode 100644 arch/s390/kernel/idle.c create mode 100644 arch/s390/kernel/ima_arch.c create mode 100644 arch/s390/kernel/ipl.c create mode 100644 arch/s390/kernel/ipl_vmparm.c create mode 100644 arch/s390/kernel/irq.c create mode 100644 arch/s390/kernel/jump_label.c create mode 100644 arch/s390/kernel/kdebugfs.c create mode 100644 arch/s390/kernel/kexec_elf.c create mode 100644 arch/s390/kernel/kexec_image.c create mode 100644 arch/s390/kernel/kprobes.c create mode 100644 arch/s390/kernel/kprobes.h create mode 100644 arch/s390/kernel/kprobes_insn_page.S create mode 100644 arch/s390/kernel/lgr.c create mode 100644 arch/s390/kernel/machine_kexec.c create mode 100644 arch/s390/kernel/machine_kexec_file.c create mode 100644 arch/s390/kernel/machine_kexec_reloc.c create mode 100644 arch/s390/kernel/mcount.S create mode 100644 arch/s390/kernel/module.c create mode 100644 arch/s390/kernel/nmi.c create mode 100644 arch/s390/kernel/nospec-branch.c create mode 100644 arch/s390/kernel/nospec-sysfs.c create mode 100644 arch/s390/kernel/numa.c create mode 100644 arch/s390/kernel/os_info.c create mode 100644 arch/s390/kernel/perf_cpum_cf.c create mode 100644 arch/s390/kernel/perf_cpum_cf_events.c create mode 100644 arch/s390/kernel/perf_cpum_sf.c create mode 100644 arch/s390/kernel/perf_event.c create mode 100644 arch/s390/kernel/perf_pai_crypto.c create mode 100644 arch/s390/kernel/perf_pai_ext.c create mode 100644 arch/s390/kernel/perf_regs.c create mode 100644 arch/s390/kernel/process.c create mode 100644 arch/s390/kernel/processor.c create mode 100644 arch/s390/kernel/ptrace.c create mode 100644 arch/s390/kernel/reipl.S create mode 100644 arch/s390/kernel/relocate_kernel.S create mode 100644 arch/s390/kernel/rethook.c create mode 100644 arch/s390/kernel/rethook.h create mode 100644 arch/s390/kernel/runtime_instr.c create mode 100644 arch/s390/kernel/setup.c create mode 100644 arch/s390/kernel/signal.c create mode 100644 arch/s390/kernel/smp.c create mode 100644 arch/s390/kernel/stacktrace.c create mode 100644 arch/s390/kernel/sthyi.c create mode 100644 arch/s390/kernel/syscall.c create mode 100644 arch/s390/kernel/syscalls/Makefile create mode 100644 arch/s390/kernel/syscalls/syscall.tbl create mode 100755 arch/s390/kernel/syscalls/syscalltbl create mode 100644 arch/s390/kernel/sysinfo.c create mode 100644 arch/s390/kernel/text_amode31.S create mode 100644 arch/s390/kernel/time.c create mode 100644 arch/s390/kernel/topology.c create mode 100644 arch/s390/kernel/trace.c create mode 100644 arch/s390/kernel/traps.c create mode 100644 arch/s390/kernel/unwind_bc.c create mode 100644 arch/s390/kernel/uprobes.c create mode 100644 arch/s390/kernel/uv.c create mode 100644 arch/s390/kernel/vdso.c create mode 100644 arch/s390/kernel/vdso32/.gitignore create mode 100644 arch/s390/kernel/vdso32/Makefile create mode 100755 arch/s390/kernel/vdso32/gen_vdso_offsets.sh create mode 100644 arch/s390/kernel/vdso32/note.S create mode 100644 arch/s390/kernel/vdso32/vdso32.lds.S create mode 100644 arch/s390/kernel/vdso32/vdso32_wrapper.S create mode 100644 arch/s390/kernel/vdso32/vdso_user_wrapper.S create mode 100644 arch/s390/kernel/vdso64/.gitignore create mode 100644 arch/s390/kernel/vdso64/Makefile create mode 100755 arch/s390/kernel/vdso64/gen_vdso_offsets.sh create mode 100644 arch/s390/kernel/vdso64/getcpu.c create mode 100644 arch/s390/kernel/vdso64/note.S create mode 100644 arch/s390/kernel/vdso64/vdso.h create mode 100644 arch/s390/kernel/vdso64/vdso64.lds.S create mode 100644 arch/s390/kernel/vdso64/vdso64_generic.c create mode 100644 arch/s390/kernel/vdso64/vdso64_wrapper.S create mode 100644 arch/s390/kernel/vdso64/vdso_user_wrapper.S create mode 100644 arch/s390/kernel/vmlinux.lds.S create mode 100644 arch/s390/kernel/vtime.c create mode 100644 arch/s390/kvm/Kconfig create mode 100644 arch/s390/kvm/Makefile create mode 100644 arch/s390/kvm/diag.c create mode 100644 arch/s390/kvm/gaccess.c create mode 100644 arch/s390/kvm/gaccess.h create mode 100644 arch/s390/kvm/guestdbg.c create mode 100644 arch/s390/kvm/intercept.c create mode 100644 arch/s390/kvm/interrupt.c create mode 100644 arch/s390/kvm/kvm-s390.c create mode 100644 arch/s390/kvm/kvm-s390.h create mode 100644 arch/s390/kvm/pci.c create mode 100644 arch/s390/kvm/pci.h create mode 100644 arch/s390/kvm/priv.c create mode 100644 arch/s390/kvm/pv.c create mode 100644 arch/s390/kvm/sigp.c create mode 100644 arch/s390/kvm/trace-s390.h create mode 100644 arch/s390/kvm/trace.h create mode 100644 arch/s390/kvm/vsie.c create mode 100644 arch/s390/lib/Makefile create mode 100644 arch/s390/lib/delay.c create mode 100644 arch/s390/lib/error-inject.c create mode 100644 arch/s390/lib/expoline/Makefile create mode 100644 arch/s390/lib/expoline/expoline.S create mode 100644 arch/s390/lib/find.c create mode 100644 arch/s390/lib/mem.S create mode 100644 arch/s390/lib/probes.c create mode 100644 arch/s390/lib/spinlock.c create mode 100644 arch/s390/lib/string.c create mode 100644 arch/s390/lib/test_kprobes.c create mode 100644 arch/s390/lib/test_kprobes.h create mode 100644 arch/s390/lib/test_kprobes_asm.S create mode 100644 arch/s390/lib/test_modules.c create mode 100644 arch/s390/lib/test_modules.h create mode 100644 arch/s390/lib/test_modules_helpers.c create mode 100644 arch/s390/lib/test_unwind.c create mode 100644 arch/s390/lib/tishift.S create mode 100644 arch/s390/lib/uaccess.c create mode 100644 arch/s390/lib/xor.c create mode 100644 arch/s390/mm/Makefile create mode 100644 arch/s390/mm/cmm.c create mode 100644 arch/s390/mm/dump_pagetables.c create mode 100644 arch/s390/mm/extable.c create mode 100644 arch/s390/mm/extmem.c create mode 100644 arch/s390/mm/fault.c create mode 100644 arch/s390/mm/gmap.c create mode 100644 arch/s390/mm/hugetlbpage.c create mode 100644 arch/s390/mm/init.c create mode 100644 arch/s390/mm/maccess.c create mode 100644 arch/s390/mm/mmap.c create mode 100644 arch/s390/mm/page-states.c create mode 100644 arch/s390/mm/pageattr.c create mode 100644 arch/s390/mm/pfault.c create mode 100644 arch/s390/mm/pgalloc.c create mode 100644 arch/s390/mm/pgtable.c create mode 100644 arch/s390/mm/vmem.c create mode 100644 arch/s390/net/Makefile create mode 100644 arch/s390/net/bpf_jit.h create mode 100644 arch/s390/net/bpf_jit_comp.c create mode 100644 arch/s390/net/pnet.c create mode 100644 arch/s390/pci/Makefile create mode 100644 arch/s390/pci/pci.c create mode 100644 arch/s390/pci/pci_bus.c create mode 100644 arch/s390/pci/pci_bus.h create mode 100644 arch/s390/pci/pci_clp.c create mode 100644 arch/s390/pci/pci_debug.c create mode 100644 arch/s390/pci/pci_dma.c create mode 100644 arch/s390/pci/pci_event.c create mode 100644 arch/s390/pci/pci_insn.c create mode 100644 arch/s390/pci/pci_iov.c create mode 100644 arch/s390/pci/pci_iov.h create mode 100644 arch/s390/pci/pci_irq.c create mode 100644 arch/s390/pci/pci_kvm_hook.c create mode 100644 arch/s390/pci/pci_mmio.c create mode 100644 arch/s390/pci/pci_sysfs.c create mode 100644 arch/s390/purgatory/.gitignore create mode 100644 arch/s390/purgatory/Makefile create mode 100644 arch/s390/purgatory/head.S create mode 100644 arch/s390/purgatory/kexec-purgatory.S create mode 100644 arch/s390/purgatory/purgatory.c create mode 100644 arch/s390/purgatory/purgatory.lds.S create mode 100644 arch/s390/purgatory/string.c create mode 100644 arch/s390/tools/.gitignore create mode 100644 arch/s390/tools/Makefile create mode 100755 arch/s390/tools/gcc-thunk-extern.sh create mode 100644 arch/s390/tools/gen_facilities.c create mode 100644 arch/s390/tools/gen_opcode_table.c create mode 100644 arch/s390/tools/opcodes.txt (limited to 'arch/s390') diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild new file mode 100644 index 0000000000..a5d3503b35 --- /dev/null +++ b/arch/s390/Kbuild @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += kernel/ +obj-y += mm/ +obj-$(CONFIG_KVM) += kvm/ +obj-y += crypto/ +obj-$(CONFIG_S390_HYPFS) += hypfs/ +obj-$(CONFIG_APPLDATA_BASE) += appldata/ +obj-y += net/ +obj-$(CONFIG_PCI) += pci/ +obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/ + +# for cleaning +subdir- += boot tools diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig new file mode 100644 index 0000000000..bd4782f23f --- /dev/null +++ b/arch/s390/Kconfig @@ -0,0 +1,935 @@ +# SPDX-License-Identifier: GPL-2.0 +config MMU + def_bool y + +config CPU_BIG_ENDIAN + def_bool y + +config LOCKDEP_SUPPORT + def_bool y + +config STACKTRACE_SUPPORT + def_bool y + +config ARCH_HAS_ILOG2_U32 + def_bool n + +config ARCH_HAS_ILOG2_U64 + def_bool n + +config GENERIC_HWEIGHT + def_bool y + +config GENERIC_BUG + def_bool y if BUG + +config GENERIC_BUG_RELATIVE_POINTERS + def_bool y + +config GENERIC_LOCKBREAK + def_bool y if PREEMPTION + +config PGSTE + def_bool y if KVM + +config AUDIT_ARCH + def_bool y + +config NO_IOPORT_MAP + def_bool y + +config PCI_QUIRKS + def_bool n + +config ARCH_SUPPORTS_UPROBES + def_bool y + +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0x1C000000000000 + +config S390 + def_bool y + # + # Note: keep this list sorted alphabetically + # + imply IMA_SECURE_AND_OR_TRUSTED_BOOT + select ALTERNATE_USER_ADDRESS_SPACE + select ARCH_32BIT_USTAT_F_TINODE + select ARCH_BINFMT_ELF_STATE + select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM + select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_HAS_CURRENT_STACK_POINTER + select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX + select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_KCOV + select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SCALED_CPUTIME + select ARCH_HAS_SET_DIRECT_MAP + select ARCH_HAS_SET_MEMORY + select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_STRICT_MODULE_RWX + select ARCH_HAS_SYSCALL_WRAPPER + select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_DATA + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_INLINE_READ_LOCK + select ARCH_INLINE_READ_LOCK_BH + select ARCH_INLINE_READ_LOCK_IRQ + select ARCH_INLINE_READ_LOCK_IRQSAVE + select ARCH_INLINE_READ_TRYLOCK + select ARCH_INLINE_READ_UNLOCK + select ARCH_INLINE_READ_UNLOCK_BH + select ARCH_INLINE_READ_UNLOCK_IRQ + select ARCH_INLINE_READ_UNLOCK_IRQRESTORE + select ARCH_INLINE_SPIN_LOCK + select ARCH_INLINE_SPIN_LOCK_BH + select ARCH_INLINE_SPIN_LOCK_IRQ + select ARCH_INLINE_SPIN_LOCK_IRQSAVE + select ARCH_INLINE_SPIN_TRYLOCK + select ARCH_INLINE_SPIN_TRYLOCK_BH + select ARCH_INLINE_SPIN_UNLOCK + select ARCH_INLINE_SPIN_UNLOCK_BH + select ARCH_INLINE_SPIN_UNLOCK_IRQ + select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + select ARCH_INLINE_WRITE_LOCK + select ARCH_INLINE_WRITE_LOCK_BH + select ARCH_INLINE_WRITE_LOCK_IRQ + select ARCH_INLINE_WRITE_LOCK_IRQSAVE + select ARCH_INLINE_WRITE_TRYLOCK + select ARCH_INLINE_WRITE_UNLOCK + select ARCH_INLINE_WRITE_UNLOCK_BH + select ARCH_INLINE_WRITE_UNLOCK_IRQ + select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_SYM_ANNOTATIONS + select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_NO_INSTR + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WANT_KERNEL_PMD_MKWRITE + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + select BUILDTIME_TABLE_SORT + select CLONE_BACKWARDS2 + select DMA_OPS if PCI + select DYNAMIC_FTRACE if FUNCTION_TRACER + select FUNCTION_ALIGNMENT_8B if CC_IS_GCC + select FUNCTION_ALIGNMENT_16B if !CC_IS_GCC + select GENERIC_ALLOCATOR + select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES + select GENERIC_ENTRY + select GENERIC_GETTIMEOFDAY + select GENERIC_PTDUMP + select GENERIC_SMP_IDLE_THREAD + select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_TIME_NS + select GENERIC_IOREMAP if PCI + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE + select HAVE_ARCH_KASAN + select HAVE_ARCH_KASAN_VMALLOC + select HAVE_ARCH_KCSAN + select HAVE_ARCH_KFENCE + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY + select HAVE_ARCH_STACKLEAK + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_VMAP_STACK + select HAVE_ASM_MODVERSIONS + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_DEBUG_KMEMLEAK + select HAVE_DMA_CONTIGUOUS + select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES + select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FAST_GUP + select HAVE_FENTRY + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_RETVAL + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER + select HAVE_GCC_PLUGINS + select HAVE_GENERIC_VDSO + select HAVE_IOREMAP_PROT if PCI + select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 + select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO + select HAVE_KERNEL_UNCOMPRESSED + select HAVE_KERNEL_XZ + select HAVE_KERNEL_ZSTD + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES + select HAVE_KVM + select HAVE_LIVEPATCH + select HAVE_MEMBLOCK_PHYS_MAP + select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI + select HAVE_NOP_MCOUNT + select HAVE_PCI + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE + select HAVE_RETHOOK + select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI + select HAVE_SETUP_PER_CPU_AREA + select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_VIRT_CPU_ACCOUNTING_IDLE + select IOMMU_HELPER if PCI + select IOMMU_SUPPORT if PCI + select KEXEC + select MMU_GATHER_MERGE_VMAS + select MMU_GATHER_NO_GATHER + select MMU_GATHER_RCU_TABLE_FREE + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE if PCI + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_SG_DMA_LENGTH if PCI + select OLD_SIGACTION + select OLD_SIGSUSPEND3 + select PCI_DOMAINS if PCI + select PCI_MSI if PCI + select PCI_MSI_ARCH_FALLBACKS if PCI_MSI + select SPARSE_IRQ + select SWIOTLB + select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT + select TTY + select VIRT_CPU_ACCOUNTING + select ZONE_DMA + # Note: keep the above list sorted alphabetically + +config SCHED_OMIT_FRAME_POINTER + def_bool y + +config PGTABLE_LEVELS + int + default 5 + +source "kernel/livepatch/Kconfig" + +config ARCH_SUPPORTS_KEXEC + def_bool y + +config ARCH_SUPPORTS_KEXEC_FILE + def_bool y + +config ARCH_SUPPORTS_KEXEC_SIG + def_bool MODULE_SIG_FORMAT + +config ARCH_SUPPORTS_KEXEC_PURGATORY + def_bool y + +config ARCH_SUPPORTS_CRASH_DUMP + def_bool y + help + Refer to for more details on this. + This option also enables s390 zfcpdump. + See also + +menu "Processor type and features" + +config HAVE_MARCH_Z10_FEATURES + def_bool n + +config HAVE_MARCH_Z196_FEATURES + def_bool n + select HAVE_MARCH_Z10_FEATURES + +config HAVE_MARCH_ZEC12_FEATURES + def_bool n + select HAVE_MARCH_Z196_FEATURES + +config HAVE_MARCH_Z13_FEATURES + def_bool n + select HAVE_MARCH_ZEC12_FEATURES + +config HAVE_MARCH_Z14_FEATURES + def_bool n + select HAVE_MARCH_Z13_FEATURES + +config HAVE_MARCH_Z15_FEATURES + def_bool n + select HAVE_MARCH_Z14_FEATURES + +config HAVE_MARCH_Z16_FEATURES + def_bool n + select HAVE_MARCH_Z15_FEATURES + +choice + prompt "Processor type" + default MARCH_Z196 + +config MARCH_Z10 + bool "IBM System z10" + select HAVE_MARCH_Z10_FEATURES + depends on $(cc-option,-march=z10) + help + Select this to enable optimizations for IBM System z10 (2097 and 2098 + series). This is the oldest machine generation currently supported. + +config MARCH_Z196 + bool "IBM zEnterprise 114 and 196" + select HAVE_MARCH_Z196_FEATURES + depends on $(cc-option,-march=z196) + help + Select this to enable optimizations for IBM zEnterprise 114 and 196 + (2818 and 2817 series). The kernel will be slightly faster but will + not work on older machines. + +config MARCH_ZEC12 + bool "IBM zBC12 and zEC12" + select HAVE_MARCH_ZEC12_FEATURES + depends on $(cc-option,-march=zEC12) + help + Select this to enable optimizations for IBM zBC12 and zEC12 (2828 and + 2827 series). The kernel will be slightly faster but will not work on + older machines. + +config MARCH_Z13 + bool "IBM z13s and z13" + select HAVE_MARCH_Z13_FEATURES + depends on $(cc-option,-march=z13) + help + Select this to enable optimizations for IBM z13s and z13 (2965 and + 2964 series). The kernel will be slightly faster but will not work on + older machines. + +config MARCH_Z14 + bool "IBM z14 ZR1 and z14" + select HAVE_MARCH_Z14_FEATURES + depends on $(cc-option,-march=z14) + help + Select this to enable optimizations for IBM z14 ZR1 and z14 (3907 + and 3906 series). The kernel will be slightly faster but will not + work on older machines. + +config MARCH_Z15 + bool "IBM z15" + select HAVE_MARCH_Z15_FEATURES + depends on $(cc-option,-march=z15) + help + Select this to enable optimizations for IBM z15 (8562 + and 8561 series). The kernel will be slightly faster but will not + work on older machines. + +config MARCH_Z16 + bool "IBM z16" + select HAVE_MARCH_Z16_FEATURES + depends on $(cc-option,-march=z16) + help + Select this to enable optimizations for IBM z16 (3931 and + 3932 series). + +endchoice + +config MARCH_Z10_TUNE + def_bool TUNE_Z10 || MARCH_Z10 && TUNE_DEFAULT + +config MARCH_Z196_TUNE + def_bool TUNE_Z196 || MARCH_Z196 && TUNE_DEFAULT + +config MARCH_ZEC12_TUNE + def_bool TUNE_ZEC12 || MARCH_ZEC12 && TUNE_DEFAULT + +config MARCH_Z13_TUNE + def_bool TUNE_Z13 || MARCH_Z13 && TUNE_DEFAULT + +config MARCH_Z14_TUNE + def_bool TUNE_Z14 || MARCH_Z14 && TUNE_DEFAULT + +config MARCH_Z15_TUNE + def_bool TUNE_Z15 || MARCH_Z15 && TUNE_DEFAULT + +config MARCH_Z16_TUNE + def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT + +choice + prompt "Tune code generation" + default TUNE_DEFAULT + help + Cause the compiler to tune (-mtune) the generated code for a machine. + This will make the code run faster on the selected machine but + somewhat slower on other machines. + This option only changes how the compiler emits instructions, not the + selection of instructions itself, so the resulting kernel will run on + all other machines. + +config TUNE_DEFAULT + bool "Default" + help + Tune the generated code for the target processor for which the kernel + will be compiled. + +config TUNE_Z10 + bool "IBM System z10" + +config TUNE_Z196 + bool "IBM zEnterprise 114 and 196" + depends on $(cc-option,-mtune=z196) + +config TUNE_ZEC12 + bool "IBM zBC12 and zEC12" + depends on $(cc-option,-mtune=zEC12) + +config TUNE_Z13 + bool "IBM z13s and z13" + depends on $(cc-option,-mtune=z13) + +config TUNE_Z14 + bool "IBM z14 ZR1 and z14" + depends on $(cc-option,-mtune=z14) + +config TUNE_Z15 + bool "IBM z15" + depends on $(cc-option,-mtune=z15) + +config TUNE_Z16 + bool "IBM z16" + depends on $(cc-option,-mtune=z16) + +endchoice + +config 64BIT + def_bool y + +config COMMAND_LINE_SIZE + int "Maximum size of kernel command line" + default 4096 + range 896 1048576 + help + This allows you to specify the maximum length of the kernel command + line. + +config COMPAT + def_bool y + prompt "Kernel support for 31 bit emulation" + select ARCH_WANT_OLD_COMPAT_IPC + select COMPAT_OLD_SIGACTION + select HAVE_UID16 + depends on MULTIUSER + depends on !CC_IS_CLANG + help + Select this option if you want to enable your system kernel to + handle system-calls from ELF binaries for 31 bit ESA. This option + (and some other stuff like libraries and such) is needed for + executing 31 bit applications. It is safe to say "Y". + +config SMP + def_bool y + +config NR_CPUS + int "Maximum number of CPUs (2-512)" + range 2 512 + default "64" + help + This allows you to specify the maximum number of CPUs which this + kernel will support. The maximum supported value is 512 and the + minimum value which makes sense is 2. + + This is purely to save memory - each supported CPU adds + approximately sixteen kilobytes to the kernel image. + +config HOTPLUG_CPU + def_bool y + +config NUMA + bool "NUMA support" + depends on SCHED_TOPOLOGY + default n + help + Enable NUMA support + + This option adds NUMA support to the kernel. + +config NODES_SHIFT + int + depends on NUMA + default "1" + +config SCHED_SMT + def_bool n + +config SCHED_MC + def_bool n + +config SCHED_TOPOLOGY + def_bool y + prompt "Topology scheduler support" + select SCHED_SMT + select SCHED_MC + help + Topology scheduler support improves the CPU scheduler's decision + making when dealing with machines that have multi-threading, + multiple cores or multiple books. + +source "kernel/Kconfig.hz" + +config CERT_STORE + bool "Get user certificates via DIAG320" + depends on KEYS + select CRYPTO_LIB_SHA256 + help + Enable this option if you want to access user-provided secure boot + certificates via DIAG 0x320. + + These certificates will be made available via the keyring named + 'cert_store'. + +config KERNEL_NOBP + def_bool n + prompt "Enable modified branch prediction for the kernel by default" + help + If this option is selected the kernel will switch to a modified + branch prediction mode if the firmware interface is available. + The modified branch prediction mode improves the behaviour in + regard to speculative execution. + + With the option enabled the kernel parameter "nobp=0" or "nospec" + can be used to run the kernel in the normal branch prediction mode. + + With the option disabled the modified branch prediction mode is + enabled with the "nobp=1" kernel parameter. + + If unsure, say N. + +config EXPOLINE + def_bool n + depends on $(cc-option,-mindirect-branch=thunk) + prompt "Avoid speculative indirect branches in the kernel" + help + Compile the kernel with the expoline compiler options to guard + against kernel-to-user data leaks by avoiding speculative indirect + branches. + Requires a compiler with -mindirect-branch=thunk support for full + protection. The kernel may run slower. + + If unsure, say N. + +config EXPOLINE_EXTERN + def_bool n + depends on EXPOLINE + depends on CC_IS_GCC && GCC_VERSION >= 110200 + depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC)) + prompt "Generate expolines as extern functions." + help + This option is required for some tooling like kpatch. The kernel is + compiled with -mindirect-branch=thunk-extern and requires a newer + compiler. + + If unsure, say N. + +choice + prompt "Expoline default" + depends on EXPOLINE + default EXPOLINE_FULL + +config EXPOLINE_OFF + bool "spectre_v2=off" + +config EXPOLINE_AUTO + bool "spectre_v2=auto" + +config EXPOLINE_FULL + bool "spectre_v2=on" + +endchoice + +config RELOCATABLE + def_bool y + help + This builds a kernel image that retains relocation information + so it can be loaded at an arbitrary address. + The kernel is linked as a position-independent executable (PIE) + and contains dynamic relocations which are processed early in the + bootup process. + The relocations make the kernel image about 15% larger (compressed + 10%), but are discarded at runtime. + Note: this option exists only for documentation purposes, please do + not remove it. + +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image (KASLR)" + default y + help + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the address at which the kernel image is loaded, + as a security feature that deters exploit attempts relying on + knowledge of the location of kernel internals. + +endmenu + +menu "Memory setup" + +config ARCH_SPARSEMEM_ENABLE + def_bool y + select SPARSEMEM_VMEMMAP_ENABLE + select SPARSEMEM_VMEMMAP + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + +config MAX_PHYSMEM_BITS + int "Maximum size of supported physical memory in bits (42-53)" + range 42 53 + default "46" + help + This option specifies the maximum supported size of physical memory + in bits. Supported is any size between 2^42 (4TB) and 2^53 (8PB). + Increasing the number of bits also increases the kernel image size. + By default 46 bits (64TB) are supported. + +config CHECK_STACK + def_bool y + depends on !VMAP_STACK + prompt "Detect kernel stack overflow" + help + This option enables the compiler option -mstack-guard and + -mstack-size if they are available. If the compiler supports them + it will emit additional code to each function prolog to trigger + an illegal operation if the kernel stack is about to overflow. + + Say N if you are unsure. + +config STACK_GUARD + int "Size of the guard area (128-1024)" + range 128 1024 + depends on CHECK_STACK + default "256" + help + This allows you to specify the size of the guard area at the lower + end of the kernel stack. If the kernel stack points into the guard + area on function entry an illegal operation is triggered. The size + needs to be a power of 2. Please keep in mind that the size of an + interrupt frame is 184 bytes for 31 bit and 328 bytes on 64 bit. + The minimum size for the stack guard should be 256 for 31 bit and + 512 for 64 bit. + +endmenu + +menu "I/O subsystem" + +config QDIO + def_tristate y + prompt "QDIO support" + help + This driver provides the Queued Direct I/O base support for + IBM System z. + + To compile this driver as a module, choose M here: the + module will be called qdio. + + If unsure, say Y. + +if PCI + +config PCI_NR_FUNCTIONS + int "Maximum number of PCI functions (1-4096)" + range 1 4096 + default "512" + help + This allows you to specify the maximum number of PCI functions which + this kernel will support. + +endif # PCI + +config HAS_IOMEM + def_bool PCI + +config CHSC_SCH + def_tristate m + prompt "Support for CHSC subchannels" + help + This driver allows usage of CHSC subchannels. A CHSC subchannel + is usually present on LPAR only. + The driver creates a device /dev/chsc, which may be used to + obtain I/O configuration information about the machine and + to issue asynchronous chsc commands (DANGEROUS). + You will usually only want to use this interface on a special + LPAR designated for system management. + + To compile this driver as a module, choose M here: the + module will be called chsc_sch. + + If unsure, say N. + +config SCM_BUS + def_bool y + prompt "SCM bus driver" + help + Bus driver for Storage Class Memory. + +config EADM_SCH + def_tristate m + prompt "Support for EADM subchannels" + depends on SCM_BUS + help + This driver allows usage of EADM subchannels. EADM subchannels act + as a communication vehicle for SCM increments. + + To compile this driver as a module, choose M here: the + module will be called eadm_sch. + +config VFIO_CCW + def_tristate n + prompt "Support for VFIO-CCW subchannels" + depends on VFIO + select VFIO_MDEV + help + This driver allows usage of I/O subchannels via VFIO-CCW. + + To compile this driver as a module, choose M here: the + module will be called vfio_ccw. + +config VFIO_AP + def_tristate n + prompt "VFIO support for AP devices" + depends on KVM + depends on VFIO + depends on ZCRYPT + select VFIO_MDEV + help + This driver grants access to Adjunct Processor (AP) devices + via the VFIO mediated device interface. + + To compile this driver as a module, choose M here: the module + will be called vfio_ap. + +endmenu + +config CCW + def_bool y + +config HAVE_PNETID + tristate + default (SMC || CCWGROUP) + +menu "Virtualization" + +config PROTECTED_VIRTUALIZATION_GUEST + def_bool n + prompt "Protected virtualization guest support" + help + Select this option, if you want to be able to run this + kernel as a protected virtualization KVM guest. + Protected virtualization capable machines have a mini hypervisor + located at machine level (an ultravisor). With help of the + Ultravisor, KVM will be able to run "protected" VMs, special + VMs whose memory and management data are unavailable to KVM. + +config PFAULT + def_bool y + prompt "Pseudo page fault support" + help + Select this option, if you want to use PFAULT pseudo page fault + handling under VM. If running native or in LPAR, this option + has no effect. If your VM does not support PFAULT, PAGEEX + pseudo page fault handling will be used. + Note that VM 4.2 supports PFAULT but has a bug in its + implementation that causes some problems. + Everybody who wants to run Linux under VM != VM4.2 should select + this option. + +config CMM + def_tristate n + prompt "Cooperative memory management" + help + Select this option, if you want to enable the kernel interface + to reduce the memory size of the system. This is accomplished + by allocating pages of memory and put them "on hold". This only + makes sense for a system running under VM where the unused pages + will be reused by VM for other guest systems. The interface + allows an external monitor to balance memory of many systems. + Everybody who wants to run Linux under VM should select this + option. + +config CMM_IUCV + def_bool y + prompt "IUCV special message interface to cooperative memory management" + depends on CMM && (SMSGIUCV=y || CMM=SMSGIUCV) + help + Select this option to enable the special message interface to + the cooperative memory management. + +config APPLDATA_BASE + def_bool n + prompt "Linux - VM Monitor Stream, base infrastructure" + depends on PROC_SYSCTL + help + This provides a kernel interface for creating and updating z/VM APPLDATA + monitor records. The monitor records are updated at certain time + intervals, once the timer is started. + Writing 1 or 0 to /proc/appldata/timer starts(1) or stops(0) the timer, + i.e. enables or disables monitoring on the Linux side. + A custom interval value (in seconds) can be written to + /proc/appldata/interval. + + Defaults are 60 seconds interval and timer off. + The /proc entries can also be read from, showing the current settings. + +config APPLDATA_MEM + def_tristate m + prompt "Monitor memory management statistics" + depends on APPLDATA_BASE && VM_EVENT_COUNTERS + help + This provides memory management related data to the Linux - VM Monitor + Stream, like paging/swapping rate, memory utilisation, etc. + Writing 1 or 0 to /proc/appldata/memory creates(1) or removes(0) a z/VM + APPLDATA monitor record, i.e. enables or disables monitoring this record + on the z/VM side. + + Default is disabled. + The /proc entry can also be read from, showing the current settings. + + This can also be compiled as a module, which will be called + appldata_mem.o. + +config APPLDATA_OS + def_tristate m + prompt "Monitor OS statistics" + depends on APPLDATA_BASE + help + This provides OS related data to the Linux - VM Monitor Stream, like + CPU utilisation, etc. + Writing 1 or 0 to /proc/appldata/os creates(1) or removes(0) a z/VM + APPLDATA monitor record, i.e. enables or disables monitoring this record + on the z/VM side. + + Default is disabled. + This can also be compiled as a module, which will be called + appldata_os.o. + +config APPLDATA_NET_SUM + def_tristate m + prompt "Monitor overall network statistics" + depends on APPLDATA_BASE && NET + help + This provides network related data to the Linux - VM Monitor Stream, + currently there is only a total sum of network I/O statistics, no + per-interface data. + Writing 1 or 0 to /proc/appldata/net_sum creates(1) or removes(0) a z/VM + APPLDATA monitor record, i.e. enables or disables monitoring this record + on the z/VM side. + + Default is disabled. + This can also be compiled as a module, which will be called + appldata_net_sum.o. + +config S390_HYPFS + def_bool y + prompt "s390 hypervisor information" + help + This provides several binary files at (debugfs)/s390_hypfs/ to + provide accounting information in an s390 hypervisor environment. + +config S390_HYPFS_FS + def_bool n + prompt "s390 hypervisor file system support" + select SYS_HYPERVISOR + depends on S390_HYPFS + help + This is a virtual file system intended to provide accounting + information in an s390 hypervisor environment. This file system + is deprecated and should not be used. + + Say N if you are unsure. + +source "arch/s390/kvm/Kconfig" + +config S390_GUEST + def_bool y + prompt "s390 support for virtio devices" + select TTY + select VIRTUALIZATION + select VIRTIO + help + Enabling this option adds support for virtio based paravirtual device + drivers on s390. + + Select this option if you want to run the kernel as a guest under + the KVM hypervisor. + +endmenu + +config S390_MODULES_SANITY_TEST_HELPERS + def_bool n + +menu "Selftests" + +config S390_UNWIND_SELFTEST + def_tristate n + depends on KUNIT + default KUNIT_ALL_TESTS + prompt "Test unwind functions" + help + This option enables s390 specific stack unwinder testing kernel + module. This option is not useful for distributions or general + kernels, but only for kernel developers working on architecture code. + + Say N if you are unsure. + +config S390_KPROBES_SANITY_TEST + def_tristate n + prompt "Enable s390 specific kprobes tests" + depends on KPROBES + depends on KUNIT + help + This option enables an s390 specific kprobes test module. This option + is not useful for distributions or general kernels, but only for kernel + developers working on architecture code. + + Say N if you are unsure. + +config S390_MODULES_SANITY_TEST + def_tristate n + depends on KUNIT + default KUNIT_ALL_TESTS + prompt "Enable s390 specific modules tests" + select S390_MODULES_SANITY_TEST_HELPERS + help + This option enables an s390 specific modules test. This option is + not useful for distributions or general kernels, but only for + kernel developers working on architecture code. + + Say N if you are unsure. +endmenu diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug new file mode 100644 index 0000000000..c4300ea4ab --- /dev/null +++ b/arch/s390/Kconfig.debug @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 + +config EARLY_PRINTK + def_bool y + +config DEBUG_ENTRY + bool "Debug low-level entry code" + depends on DEBUG_KERNEL + help + This option enables sanity checks in s390 low-level entry code. + Some of these sanity checks may slow down kernel entries and + exits or otherwise impact performance. + + If unsure, say N. + +config CIO_INJECT + bool "CIO Inject interfaces" + depends on DEBUG_KERNEL && DEBUG_FS + help + This option provides a debugging facility to inject certain artificial events + and instruction responses to the CIO layer of Linux kernel. The newly created + debugfs user-interfaces will be at /sys/kernel/debug/s390/cio/* diff --git a/arch/s390/Makefile b/arch/s390/Makefile new file mode 100644 index 0000000000..a53a36ee07 --- /dev/null +++ b/arch/s390/Makefile @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# s390/Makefile +# +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# +# Copyright (C) 1994 by Linus Torvalds +# + +LD_BFD := elf64-s390 +KBUILD_LDFLAGS := -m elf64_s390 +KBUILD_AFLAGS_MODULE += -fPIC +KBUILD_CFLAGS_MODULE += -fPIC +KBUILD_AFLAGS += -m64 +KBUILD_CFLAGS += -m64 +KBUILD_CFLAGS += -fPIE +LDFLAGS_vmlinux := -pie +aflags_dwarf := -Wa,-gdwarf-2 +KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ +ifndef CONFIG_AS_IS_LLVM +KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) +endif +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack +KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY +KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain +KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables +KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding +KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector +KBUILD_CFLAGS_DECOMPRESSOR += -fPIE +KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) + +UTS_MACHINE := s390x +STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) +CHECKFLAGS += -D__s390__ -D__s390x__ + +export LD_BFD + +mflags-$(CONFIG_MARCH_Z10) := -march=z10 +mflags-$(CONFIG_MARCH_Z196) := -march=z196 +mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 +mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z14) := -march=z14 +mflags-$(CONFIG_MARCH_Z15) := -march=z15 +mflags-$(CONFIG_MARCH_Z16) := -march=z16 + +export CC_FLAGS_MARCH := $(mflags-y) + +aflags-y += $(mflags-y) +cflags-y += $(mflags-y) + +cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10 +cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 +cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 +cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 +cflags-$(CONFIG_MARCH_Z15_TUNE) += -mtune=z15 +cflags-$(CONFIG_MARCH_Z16_TUNE) += -mtune=z16 + +cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include + +# +# Prevent tail-call optimizations, to get clearer backtraces: +# +cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls + +KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y) +KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y) + +ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),) + CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE) + ifeq ($(call cc-option,-mstack-size=8192),) + CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD) + endif + export CC_FLAGS_CHECK_STACK + cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK) +endif + +ifdef CONFIG_EXPOLINE + ifdef CONFIG_EXPOLINE_EXTERN + KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o + CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern + CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern + else + CC_FLAGS_EXPOLINE := -mindirect-branch=thunk + CC_FLAGS_EXPOLINE += -mfunction-return=thunk + endif + CC_FLAGS_EXPOLINE += -mindirect-branch-table + export CC_FLAGS_EXPOLINE + cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE + aflags-y += -DCC_USING_EXPOLINE +endif + +ifdef CONFIG_FUNCTION_TRACER + ifeq ($(call cc-option,-mfentry -mnop-mcount),) + # make use of hotpatch feature if the compiler supports it + cc_hotpatch := -mhotpatch=0,3 + ifneq ($(call cc-option,$(cc_hotpatch)),) + CC_FLAGS_FTRACE := $(cc_hotpatch) + KBUILD_AFLAGS += -DCC_USING_HOTPATCH + KBUILD_CFLAGS += -DCC_USING_HOTPATCH + endif + endif +endif + +# Test CFI features of binutils +cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1) + +KBUILD_CFLAGS += -mpacked-stack -mbackchain -msoft-float $(cflags-y) +KBUILD_CFLAGS += -pipe -Wno-sign-compare +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi) +KBUILD_AFLAGS += $(aflags-y) $(cfi) +export KBUILD_AFLAGS_DECOMPRESSOR +export KBUILD_CFLAGS_DECOMPRESSOR + +OBJCOPYFLAGS := -O binary + +libs-y += arch/s390/lib/ + +boot := arch/s390/boot +syscalls := arch/s390/kernel/syscalls +tools := arch/s390/tools + +all: bzImage + +#KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg... +KBUILD_IMAGE := $(boot)/bzImage + +install: + $(call cmd,install) + +bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +zfcpdump: + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +vdso_install: + $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@ + +archheaders: + $(Q)$(MAKE) $(build)=$(syscalls) uapi + +archprepare: + $(Q)$(MAKE) $(build)=$(syscalls) kapi + $(Q)$(MAKE) $(build)=$(tools) kapi +ifeq ($(KBUILD_EXTMOD),) +# We need to generate vdso-offsets.h before compiling certain files in kernel/. +# In order to do that, we should use the archprepare target, but we can't since +# asm-offsets.h is included in some files used to generate vdso-offsets.h, and +# asm-offsets.h is built in prepare0, for which archprepare is a dependency. +# Therefore we need to generate the header after prepare0 has been made, hence +# this hack. +prepare: vdso_prepare +vdso_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h + $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ + $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) + +ifdef CONFIG_EXPOLINE_EXTERN +modules_prepare: expoline_prepare +expoline_prepare: scripts + $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o +endif +endif + +# Don't use tabs in echo arguments +define archhelp + echo '* bzImage - Kernel image for IPL ($(boot)/bzImage)' + echo ' install - Install kernel using' + echo ' (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or' + echo ' install to $$(INSTALL_PATH)' +endef diff --git a/arch/s390/appldata/Makefile b/arch/s390/appldata/Makefile new file mode 100644 index 0000000000..b06def4a4f --- /dev/null +++ b/arch/s390/appldata/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux - z/VM Monitor Stream. +# + +obj-$(CONFIG_APPLDATA_BASE) += appldata_base.o +obj-$(CONFIG_APPLDATA_MEM) += appldata_mem.o +obj-$(CONFIG_APPLDATA_OS) += appldata_os.o +obj-$(CONFIG_APPLDATA_NET_SUM) += appldata_net_sum.o diff --git a/arch/s390/appldata/appldata.h b/arch/s390/appldata/appldata.h new file mode 100644 index 0000000000..10346d2f3d --- /dev/null +++ b/arch/s390/appldata/appldata.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Definitions and interface for Linux - z/VM Monitor Stream. + * + * Copyright IBM Corp. 2003, 2008 + * + * Author: Gerald Schaefer + */ + +#define APPLDATA_MAX_REC_SIZE 4024 /* Maximum size of the */ + /* data buffer */ +#define APPLDATA_MAX_PROCS 100 + +#define APPLDATA_PROC_NAME_LENGTH 16 /* Max. length of /proc name */ + +#define APPLDATA_RECORD_MEM_ID 0x01 /* IDs to identify the */ +#define APPLDATA_RECORD_OS_ID 0x02 /* individual records, */ +#define APPLDATA_RECORD_NET_SUM_ID 0x03 /* must be < 256 ! */ +#define APPLDATA_RECORD_PROC_ID 0x04 + +#define CTL_APPLDATA_TIMER 2121 /* sysctl IDs, must be unique */ +#define CTL_APPLDATA_INTERVAL 2122 +#define CTL_APPLDATA_MEM 2123 +#define CTL_APPLDATA_OS 2124 +#define CTL_APPLDATA_NET_SUM 2125 +#define CTL_APPLDATA_PROC 2126 + +struct appldata_ops { + struct list_head list; + struct ctl_table_header *sysctl_header; + struct ctl_table *ctl_table; + int active; /* monitoring status */ + + /* fill in from here */ + char name[APPLDATA_PROC_NAME_LENGTH]; /* name of /proc fs node */ + unsigned char record_nr; /* Record Nr. for Product ID */ + void (*callback)(void *data); /* callback function */ + void *data; /* record data */ + unsigned int size; /* size of record */ + struct module *owner; /* THIS_MODULE */ + char mod_lvl[2]; /* modification level, EBCDIC */ +}; + +extern int appldata_register_ops(struct appldata_ops *ops); +extern void appldata_unregister_ops(struct appldata_ops *ops); +extern int appldata_diag(char record_nr, u16 function, unsigned long buffer, + u16 length, char *mod_lvl); + diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c new file mode 100644 index 0000000000..3b09946256 --- /dev/null +++ b/arch/s390/appldata/appldata_base.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Base infrastructure for Linux-z/VM Monitor Stream, Stage 1. + * Exports appldata_register_ops() and appldata_unregister_ops() for the + * data gathering modules. + * + * Copyright IBM Corp. 2003, 2009 + * + * Author: Gerald Schaefer + */ + +#define KMSG_COMPONENT "appldata" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "appldata.h" + + +#define APPLDATA_CPU_INTERVAL 10000 /* default (CPU) time for + sampling interval in + milliseconds */ + +#define TOD_MICRO 0x01000 /* nr. of TOD clock units + for 1 microsecond */ + +/* + * /proc entries (sysctl) + */ +static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; +static int appldata_timer_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static int appldata_interval_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos); + +static struct ctl_table_header *appldata_sysctl_header; +static struct ctl_table appldata_table[] = { + { + .procname = "timer", + .mode = S_IRUGO | S_IWUSR, + .proc_handler = appldata_timer_handler, + }, + { + .procname = "interval", + .mode = S_IRUGO | S_IWUSR, + .proc_handler = appldata_interval_handler, + }, + { }, +}; + +/* + * Timer + */ +static struct vtimer_list appldata_timer; + +static DEFINE_SPINLOCK(appldata_timer_lock); +static int appldata_interval = APPLDATA_CPU_INTERVAL; +static int appldata_timer_active; + +/* + * Work queue + */ +static struct workqueue_struct *appldata_wq; +static void appldata_work_fn(struct work_struct *work); +static DECLARE_WORK(appldata_work, appldata_work_fn); + + +/* + * Ops list + */ +static DEFINE_MUTEX(appldata_ops_mutex); +static LIST_HEAD(appldata_ops_list); + + +/*************************** timer, work, DIAG *******************************/ +/* + * appldata_timer_function() + * + * schedule work and reschedule timer + */ +static void appldata_timer_function(unsigned long data) +{ + queue_work(appldata_wq, (struct work_struct *) data); +} + +/* + * appldata_work_fn() + * + * call data gathering function for each (active) module + */ +static void appldata_work_fn(struct work_struct *work) +{ + struct list_head *lh; + struct appldata_ops *ops; + + mutex_lock(&appldata_ops_mutex); + list_for_each(lh, &appldata_ops_list) { + ops = list_entry(lh, struct appldata_ops, list); + if (ops->active == 1) { + ops->callback(ops->data); + } + } + mutex_unlock(&appldata_ops_mutex); +} + +static struct appldata_product_id appldata_id = { + .prod_nr = {0xD3, 0xC9, 0xD5, 0xE4, + 0xE7, 0xD2, 0xD9}, /* "LINUXKR" */ + .prod_fn = 0xD5D3, /* "NL" */ + .version_nr = 0xF2F6, /* "26" */ + .release_nr = 0xF0F1, /* "01" */ +}; + +/* + * appldata_diag() + * + * prepare parameter list, issue DIAG 0xDC + */ +int appldata_diag(char record_nr, u16 function, unsigned long buffer, + u16 length, char *mod_lvl) +{ + struct appldata_parameter_list *parm_list; + struct appldata_product_id *id; + int rc; + + parm_list = kmalloc(sizeof(*parm_list), GFP_KERNEL); + id = kmemdup(&appldata_id, sizeof(appldata_id), GFP_KERNEL); + rc = -ENOMEM; + if (parm_list && id) { + id->record_nr = record_nr; + id->mod_lvl = (mod_lvl[0]) << 8 | mod_lvl[1]; + rc = appldata_asm(parm_list, id, function, + (void *) buffer, length); + } + kfree(id); + kfree(parm_list); + return rc; +} +/************************ timer, work, DIAG ****************************/ + + +/****************************** /proc stuff **********************************/ + +#define APPLDATA_ADD_TIMER 0 +#define APPLDATA_DEL_TIMER 1 +#define APPLDATA_MOD_TIMER 2 + +/* + * __appldata_vtimer_setup() + * + * Add, delete or modify virtual timers on all online cpus. + * The caller needs to get the appldata_timer_lock spinlock. + */ +static void __appldata_vtimer_setup(int cmd) +{ + u64 timer_interval = (u64) appldata_interval * 1000 * TOD_MICRO; + + switch (cmd) { + case APPLDATA_ADD_TIMER: + if (appldata_timer_active) + break; + appldata_timer.expires = timer_interval; + add_virt_timer_periodic(&appldata_timer); + appldata_timer_active = 1; + break; + case APPLDATA_DEL_TIMER: + del_virt_timer(&appldata_timer); + if (!appldata_timer_active) + break; + appldata_timer_active = 0; + break; + case APPLDATA_MOD_TIMER: + if (!appldata_timer_active) + break; + mod_virt_timer_periodic(&appldata_timer, timer_interval); + } +} + +/* + * appldata_timer_handler() + * + * Start/Stop timer, show status of timer (0 = not active, 1 = active) + */ +static int +appldata_timer_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int timer_active = appldata_timer_active; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &timer_active, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + spin_lock(&appldata_timer_lock); + if (timer_active) + __appldata_vtimer_setup(APPLDATA_ADD_TIMER); + else + __appldata_vtimer_setup(APPLDATA_DEL_TIMER); + spin_unlock(&appldata_timer_lock); + return 0; +} + +/* + * appldata_interval_handler() + * + * Set (CPU) timer interval for collection of data (in milliseconds), show + * current timer interval. + */ +static int +appldata_interval_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int interval = appldata_interval; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &interval, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ONE, + }; + + rc = proc_dointvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + spin_lock(&appldata_timer_lock); + appldata_interval = interval; + __appldata_vtimer_setup(APPLDATA_MOD_TIMER); + spin_unlock(&appldata_timer_lock); + return 0; +} + +/* + * appldata_generic_handler() + * + * Generic start/stop monitoring and DIAG, show status of + * monitoring (0 = not in process, 1 = in process) + */ +static int +appldata_generic_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct appldata_ops *ops = NULL, *tmp_ops; + struct list_head *lh; + int rc, found; + int active; + struct ctl_table ctl_entry = { + .data = &active, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + found = 0; + mutex_lock(&appldata_ops_mutex); + list_for_each(lh, &appldata_ops_list) { + tmp_ops = list_entry(lh, struct appldata_ops, list); + if (&tmp_ops->ctl_table[0] == ctl) { + found = 1; + } + } + if (!found) { + mutex_unlock(&appldata_ops_mutex); + return -ENODEV; + } + ops = ctl->data; + if (!try_module_get(ops->owner)) { // protect this function + mutex_unlock(&appldata_ops_mutex); + return -ENODEV; + } + mutex_unlock(&appldata_ops_mutex); + + active = ops->active; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) { + module_put(ops->owner); + return rc; + } + + mutex_lock(&appldata_ops_mutex); + if (active && (ops->active == 0)) { + // protect work queue callback + if (!try_module_get(ops->owner)) { + mutex_unlock(&appldata_ops_mutex); + module_put(ops->owner); + return -ENODEV; + } + ops->callback(ops->data); // init record + rc = appldata_diag(ops->record_nr, + APPLDATA_START_INTERVAL_REC, + (unsigned long) ops->data, ops->size, + ops->mod_lvl); + if (rc != 0) { + pr_err("Starting the data collection for %s " + "failed with rc=%d\n", ops->name, rc); + module_put(ops->owner); + } else + ops->active = 1; + } else if (!active && (ops->active == 1)) { + ops->active = 0; + rc = appldata_diag(ops->record_nr, APPLDATA_STOP_REC, + (unsigned long) ops->data, ops->size, + ops->mod_lvl); + if (rc != 0) + pr_err("Stopping the data collection for %s " + "failed with rc=%d\n", ops->name, rc); + module_put(ops->owner); + } + mutex_unlock(&appldata_ops_mutex); + module_put(ops->owner); + return 0; +} + +/*************************** /proc stuff *******************************/ + + +/************************* module-ops management *****************************/ +/* + * appldata_register_ops() + * + * update ops list, register /proc/sys entries + */ +int appldata_register_ops(struct appldata_ops *ops) +{ + if (ops->size > APPLDATA_MAX_REC_SIZE) + return -EINVAL; + + /* The last entry must be an empty one */ + ops->ctl_table = kcalloc(2, sizeof(struct ctl_table), GFP_KERNEL); + if (!ops->ctl_table) + return -ENOMEM; + + mutex_lock(&appldata_ops_mutex); + list_add(&ops->list, &appldata_ops_list); + mutex_unlock(&appldata_ops_mutex); + + ops->ctl_table[0].procname = ops->name; + ops->ctl_table[0].mode = S_IRUGO | S_IWUSR; + ops->ctl_table[0].proc_handler = appldata_generic_handler; + ops->ctl_table[0].data = ops; + + ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1); + if (!ops->sysctl_header) + goto out; + return 0; +out: + mutex_lock(&appldata_ops_mutex); + list_del(&ops->list); + mutex_unlock(&appldata_ops_mutex); + kfree(ops->ctl_table); + return -ENOMEM; +} + +/* + * appldata_unregister_ops() + * + * update ops list, unregister /proc entries, stop DIAG if necessary + */ +void appldata_unregister_ops(struct appldata_ops *ops) +{ + mutex_lock(&appldata_ops_mutex); + list_del(&ops->list); + mutex_unlock(&appldata_ops_mutex); + unregister_sysctl_table(ops->sysctl_header); + kfree(ops->ctl_table); +} +/********************** module-ops management **************************/ + + +/******************************* init / exit *********************************/ + +/* + * appldata_init() + * + * init timer, register /proc entries + */ +static int __init appldata_init(void) +{ + init_virt_timer(&appldata_timer); + appldata_timer.function = appldata_timer_function; + appldata_timer.data = (unsigned long) &appldata_work; + appldata_wq = alloc_ordered_workqueue("appldata", 0); + if (!appldata_wq) + return -ENOMEM; + appldata_sysctl_header = register_sysctl(appldata_proc_name, appldata_table); + return 0; +} + +__initcall(appldata_init); + +/**************************** init / exit ******************************/ + +EXPORT_SYMBOL_GPL(appldata_register_ops); +EXPORT_SYMBOL_GPL(appldata_unregister_ops); +EXPORT_SYMBOL_GPL(appldata_diag); + +#ifdef CONFIG_SWAP +EXPORT_SYMBOL_GPL(si_swapinfo); +#endif +EXPORT_SYMBOL_GPL(nr_threads); +EXPORT_SYMBOL_GPL(nr_running); +EXPORT_SYMBOL_GPL(nr_iowait); diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c new file mode 100644 index 0000000000..fc608f9b79 --- /dev/null +++ b/arch/s390/appldata/appldata_mem.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data gathering module for Linux-VM Monitor Stream, Stage 1. + * Collects data related to memory management. + * + * Copyright IBM Corp. 2003, 2006 + * + * Author: Gerald Schaefer + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "appldata.h" + + +#define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* Converts #Pages to KB */ + +/* + * Memory data + * + * This is accessed as binary data by z/VM. If changes to it can't be avoided, + * the structure version (product ID, see appldata_base.c) needs to be changed + * as well and all documentation and z/VM applications using it must be + * updated. + */ +struct appldata_mem_data { + u64 timestamp; + u32 sync_count_1; /* after VM collected the record data, */ + u32 sync_count_2; /* sync_count_1 and sync_count_2 should be the + same. If not, the record has been updated on + the Linux side while VM was collecting the + (possibly corrupt) data */ + + u64 pgpgin; /* data read from disk */ + u64 pgpgout; /* data written to disk */ + u64 pswpin; /* pages swapped in */ + u64 pswpout; /* pages swapped out */ + + u64 sharedram; /* sharedram is currently set to 0 */ + + u64 totalram; /* total main memory size */ + u64 freeram; /* free main memory size */ + u64 totalhigh; /* total high memory size */ + u64 freehigh; /* free high memory size */ + + u64 bufferram; /* memory reserved for buffers, free cache */ + u64 cached; /* size of (used) cache, w/o buffers */ + u64 totalswap; /* total swap space size */ + u64 freeswap; /* free swap space */ + +// New in 2.6 --> + u64 pgalloc; /* page allocations */ + u64 pgfault; /* page faults (major+minor) */ + u64 pgmajfault; /* page faults (major only) */ +// <-- New in 2.6 + +} __packed; + + +/* + * appldata_get_mem_data() + * + * gather memory data + */ +static void appldata_get_mem_data(void *data) +{ + /* + * don't put large structures on the stack, we are + * serialized through the appldata_ops_mutex and can use static + */ + static struct sysinfo val; + unsigned long ev[NR_VM_EVENT_ITEMS]; + struct appldata_mem_data *mem_data; + + mem_data = data; + mem_data->sync_count_1++; + + all_vm_events(ev); + mem_data->pgpgin = ev[PGPGIN] >> 1; + mem_data->pgpgout = ev[PGPGOUT] >> 1; + mem_data->pswpin = ev[PSWPIN]; + mem_data->pswpout = ev[PSWPOUT]; + mem_data->pgalloc = ev[PGALLOC_NORMAL]; + mem_data->pgalloc += ev[PGALLOC_DMA]; + mem_data->pgfault = ev[PGFAULT]; + mem_data->pgmajfault = ev[PGMAJFAULT]; + + si_meminfo(&val); + mem_data->sharedram = val.sharedram; + mem_data->totalram = P2K(val.totalram); + mem_data->freeram = P2K(val.freeram); + mem_data->totalhigh = P2K(val.totalhigh); + mem_data->freehigh = P2K(val.freehigh); + mem_data->bufferram = P2K(val.bufferram); + mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES) + - val.bufferram); + + si_swapinfo(&val); + mem_data->totalswap = P2K(val.totalswap); + mem_data->freeswap = P2K(val.freeswap); + + mem_data->timestamp = get_tod_clock(); + mem_data->sync_count_2++; +} + + +static struct appldata_ops ops = { + .name = "mem", + .record_nr = APPLDATA_RECORD_MEM_ID, + .size = sizeof(struct appldata_mem_data), + .callback = &appldata_get_mem_data, + .owner = THIS_MODULE, + .mod_lvl = {0xF0, 0xF0}, /* EBCDIC "00" */ +}; + + +/* + * appldata_mem_init() + * + * init_data, register ops + */ +static int __init appldata_mem_init(void) +{ + int ret; + + ops.data = kzalloc(sizeof(struct appldata_mem_data), GFP_KERNEL); + if (!ops.data) + return -ENOMEM; + + ret = appldata_register_ops(&ops); + if (ret) + kfree(ops.data); + + return ret; +} + +/* + * appldata_mem_exit() + * + * unregister ops + */ +static void __exit appldata_mem_exit(void) +{ + appldata_unregister_ops(&ops); + kfree(ops.data); +} + + +module_init(appldata_mem_init); +module_exit(appldata_mem_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gerald Schaefer"); +MODULE_DESCRIPTION("Linux-VM Monitor Stream, MEMORY statistics"); diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c new file mode 100644 index 0000000000..59c282ca00 --- /dev/null +++ b/arch/s390/appldata/appldata_net_sum.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data gathering module for Linux-VM Monitor Stream, Stage 1. + * Collects accumulated network statistics (Packets received/transmitted, + * dropped, errors, ...). + * + * Copyright IBM Corp. 2003, 2006 + * + * Author: Gerald Schaefer + */ + +#include +#include +#include +#include +#include +#include + +#include "appldata.h" + + +/* + * Network data + * + * This is accessed as binary data by z/VM. If changes to it can't be avoided, + * the structure version (product ID, see appldata_base.c) needs to be changed + * as well and all documentation and z/VM applications using it must be updated. + */ +struct appldata_net_sum_data { + u64 timestamp; + u32 sync_count_1; /* after VM collected the record data, */ + u32 sync_count_2; /* sync_count_1 and sync_count_2 should be the + same. If not, the record has been updated on + the Linux side while VM was collecting the + (possibly corrupt) data */ + + u32 nr_interfaces; /* nr. of network interfaces being monitored */ + + u32 padding; /* next value is 64-bit aligned, so these */ + /* 4 byte would be padded out by compiler */ + + u64 rx_packets; /* total packets received */ + u64 tx_packets; /* total packets transmitted */ + u64 rx_bytes; /* total bytes received */ + u64 tx_bytes; /* total bytes transmitted */ + u64 rx_errors; /* bad packets received */ + u64 tx_errors; /* packet transmit problems */ + u64 rx_dropped; /* no space in linux buffers */ + u64 tx_dropped; /* no space available in linux */ + u64 collisions; /* collisions while transmitting */ +} __packed; + + +/* + * appldata_get_net_sum_data() + * + * gather accumulated network statistics + */ +static void appldata_get_net_sum_data(void *data) +{ + int i; + struct appldata_net_sum_data *net_data; + struct net_device *dev; + unsigned long rx_packets, tx_packets, rx_bytes, tx_bytes, rx_errors, + tx_errors, rx_dropped, tx_dropped, collisions; + + net_data = data; + net_data->sync_count_1++; + + i = 0; + rx_packets = 0; + tx_packets = 0; + rx_bytes = 0; + tx_bytes = 0; + rx_errors = 0; + tx_errors = 0; + rx_dropped = 0; + tx_dropped = 0; + collisions = 0; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + const struct rtnl_link_stats64 *stats; + struct rtnl_link_stats64 temp; + + stats = dev_get_stats(dev, &temp); + rx_packets += stats->rx_packets; + tx_packets += stats->tx_packets; + rx_bytes += stats->rx_bytes; + tx_bytes += stats->tx_bytes; + rx_errors += stats->rx_errors; + tx_errors += stats->tx_errors; + rx_dropped += stats->rx_dropped; + tx_dropped += stats->tx_dropped; + collisions += stats->collisions; + i++; + } + rcu_read_unlock(); + + net_data->nr_interfaces = i; + net_data->rx_packets = rx_packets; + net_data->tx_packets = tx_packets; + net_data->rx_bytes = rx_bytes; + net_data->tx_bytes = tx_bytes; + net_data->rx_errors = rx_errors; + net_data->tx_errors = tx_errors; + net_data->rx_dropped = rx_dropped; + net_data->tx_dropped = tx_dropped; + net_data->collisions = collisions; + + net_data->timestamp = get_tod_clock(); + net_data->sync_count_2++; +} + + +static struct appldata_ops ops = { + .name = "net_sum", + .record_nr = APPLDATA_RECORD_NET_SUM_ID, + .size = sizeof(struct appldata_net_sum_data), + .callback = &appldata_get_net_sum_data, + .owner = THIS_MODULE, + .mod_lvl = {0xF0, 0xF0}, /* EBCDIC "00" */ +}; + + +/* + * appldata_net_init() + * + * init data, register ops + */ +static int __init appldata_net_init(void) +{ + int ret; + + ops.data = kzalloc(sizeof(struct appldata_net_sum_data), GFP_KERNEL); + if (!ops.data) + return -ENOMEM; + + ret = appldata_register_ops(&ops); + if (ret) + kfree(ops.data); + + return ret; +} + +/* + * appldata_net_exit() + * + * unregister ops + */ +static void __exit appldata_net_exit(void) +{ + appldata_unregister_ops(&ops); + kfree(ops.data); +} + + +module_init(appldata_net_init); +module_exit(appldata_net_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gerald Schaefer"); +MODULE_DESCRIPTION("Linux-VM Monitor Stream, accumulated network statistics"); diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c new file mode 100644 index 0000000000..a363d30ce7 --- /dev/null +++ b/arch/s390/appldata/appldata_os.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data gathering module for Linux-VM Monitor Stream, Stage 1. + * Collects misc. OS related data (CPU utilization, running processes). + * + * Copyright IBM Corp. 2003, 2006 + * + * Author: Gerald Schaefer + */ + +#define KMSG_COMPONENT "appldata" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "appldata.h" + +/* + * OS data + * + * This is accessed as binary data by z/VM. If changes to it can't be avoided, + * the structure version (product ID, see appldata_base.c) needs to be changed + * as well and all documentation and z/VM applications using it must be + * updated. + */ +struct appldata_os_per_cpu { + u32 per_cpu_user; /* timer ticks spent in user mode */ + u32 per_cpu_nice; /* ... spent with modified priority */ + u32 per_cpu_system; /* ... spent in kernel mode */ + u32 per_cpu_idle; /* ... spent in idle mode */ + + /* New in 2.6 */ + u32 per_cpu_irq; /* ... spent in interrupts */ + u32 per_cpu_softirq; /* ... spent in softirqs */ + u32 per_cpu_iowait; /* ... spent while waiting for I/O */ + + /* New in modification level 01 */ + u32 per_cpu_steal; /* ... stolen by hypervisor */ + u32 cpu_id; /* number of this CPU */ +} __attribute__((packed)); + +struct appldata_os_data { + u64 timestamp; + u32 sync_count_1; /* after VM collected the record data, */ + u32 sync_count_2; /* sync_count_1 and sync_count_2 should be the + same. If not, the record has been updated on + the Linux side while VM was collecting the + (possibly corrupt) data */ + + u32 nr_cpus; /* number of (virtual) CPUs */ + u32 per_cpu_size; /* size of the per-cpu data struct */ + u32 cpu_offset; /* offset of the first per-cpu data struct */ + + u32 nr_running; /* number of runnable threads */ + u32 nr_threads; /* number of threads */ + u32 avenrun[3]; /* average nr. of running processes during */ + /* the last 1, 5 and 15 minutes */ + + /* New in 2.6 */ + u32 nr_iowait; /* number of blocked threads + (waiting for I/O) */ + + /* per cpu data */ + struct appldata_os_per_cpu os_cpu[]; +} __attribute__((packed)); + +static struct appldata_os_data *appldata_os_data; + +static struct appldata_ops ops = { + .name = "os", + .record_nr = APPLDATA_RECORD_OS_ID, + .owner = THIS_MODULE, + .mod_lvl = {0xF0, 0xF1}, /* EBCDIC "01" */ +}; + + +/* + * appldata_get_os_data() + * + * gather OS data + */ +static void appldata_get_os_data(void *data) +{ + int i, j, rc; + struct appldata_os_data *os_data; + unsigned int new_size; + + os_data = data; + os_data->sync_count_1++; + + os_data->nr_threads = nr_threads; + os_data->nr_running = nr_running(); + os_data->nr_iowait = nr_iowait(); + os_data->avenrun[0] = avenrun[0] + (FIXED_1/200); + os_data->avenrun[1] = avenrun[1] + (FIXED_1/200); + os_data->avenrun[2] = avenrun[2] + (FIXED_1/200); + + j = 0; + for_each_online_cpu(i) { + os_data->os_cpu[j].per_cpu_user = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); + os_data->os_cpu[j].per_cpu_nice = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); + os_data->os_cpu[j].per_cpu_system = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); + os_data->os_cpu[j].per_cpu_idle = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); + os_data->os_cpu[j].per_cpu_irq = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); + os_data->os_cpu[j].per_cpu_softirq = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); + os_data->os_cpu[j].per_cpu_iowait = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); + os_data->os_cpu[j].per_cpu_steal = + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); + os_data->os_cpu[j].cpu_id = i; + j++; + } + + os_data->nr_cpus = j; + + new_size = struct_size(os_data, os_cpu, os_data->nr_cpus); + if (ops.size != new_size) { + if (ops.active) { + rc = appldata_diag(APPLDATA_RECORD_OS_ID, + APPLDATA_START_INTERVAL_REC, + (unsigned long) ops.data, new_size, + ops.mod_lvl); + if (rc != 0) + pr_err("Starting a new OS data collection " + "failed with rc=%d\n", rc); + + rc = appldata_diag(APPLDATA_RECORD_OS_ID, + APPLDATA_STOP_REC, + (unsigned long) ops.data, ops.size, + ops.mod_lvl); + if (rc != 0) + pr_err("Stopping a faulty OS data " + "collection failed with rc=%d\n", rc); + } + ops.size = new_size; + } + os_data->timestamp = get_tod_clock(); + os_data->sync_count_2++; +} + + +/* + * appldata_os_init() + * + * init data, register ops + */ +static int __init appldata_os_init(void) +{ + int rc, max_size; + + max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus()); + if (max_size > APPLDATA_MAX_REC_SIZE) { + pr_err("Maximum OS record size %i exceeds the maximum " + "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE); + rc = -ENOMEM; + goto out; + } + + appldata_os_data = kzalloc(max_size, GFP_KERNEL | GFP_DMA); + if (appldata_os_data == NULL) { + rc = -ENOMEM; + goto out; + } + + appldata_os_data->per_cpu_size = sizeof(struct appldata_os_per_cpu); + appldata_os_data->cpu_offset = offsetof(struct appldata_os_data, + os_cpu); + + ops.data = appldata_os_data; + ops.callback = &appldata_get_os_data; + rc = appldata_register_ops(&ops); + if (rc != 0) + kfree(appldata_os_data); +out: + return rc; +} + +/* + * appldata_os_exit() + * + * unregister ops + */ +static void __exit appldata_os_exit(void) +{ + appldata_unregister_ops(&ops); + kfree(appldata_os_data); +} + + +module_init(appldata_os_init); +module_exit(appldata_os_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gerald Schaefer"); +MODULE_DESCRIPTION("Linux-VM Monitor Stream, OS statistics"); diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore new file mode 100644 index 0000000000..f56591bc08 --- /dev/null +++ b/arch/s390/boot/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +image +bzImage +section_cmp.* +vmlinux +vmlinux.lds +vmlinux.syms diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile new file mode 100644 index 0000000000..c7c81e5f92 --- /dev/null +++ b/arch/s390/boot/Makefile @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux s390-specific parts of the memory manager. +# + +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) +KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) + +# +# Use minimum architecture for als.c to be able to print an error +# message if the kernel is started on a machine which is too old +# +ifndef CONFIG_CC_IS_CLANG +CC_FLAGS_MARCH_MINIMUM := -march=z900 +else +CC_FLAGS_MARCH_MINIMUM := -march=z10 +endif + +ifneq ($(CC_FLAGS_MARCH),$(CC_FLAGS_MARCH_MINIMUM)) +AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) +AFLAGS_head.o += $(CC_FLAGS_MARCH_MINIMUM) +AFLAGS_REMOVE_mem.o += $(CC_FLAGS_MARCH) +AFLAGS_mem.o += $(CC_FLAGS_MARCH_MINIMUM) +CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) +CFLAGS_als.o += $(CC_FLAGS_MARCH_MINIMUM) +CFLAGS_REMOVE_sclp_early_core.o += $(CC_FLAGS_MARCH) +CFLAGS_sclp_early_core.o += $(CC_FLAGS_MARCH_MINIMUM) +endif + +CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char + +obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o +obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o +obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o +obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o +obj-all := $(obj-y) piggy.o syms.o + +targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) +targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 +targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 +targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all) + +OBJECTS := $(addprefix $(obj)/,$(obj-y)) +OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all)) + +clean-files += vmlinux.map + +quiet_cmd_section_cmp = SECTCMP $* +define cmd_section_cmp + s1=`$(OBJDUMP) -t -j "$*" "$<" | sort | \ + sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \ + s2=`$(OBJDUMP) -t -j "$*" "$(word 2,$^)" | sort | \ + sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \ + if [ "$$s1" != "$$s2" ]; then \ + echo "error: section $* differs between $< and $(word 2,$^)" >&2; \ + exit 1; \ + fi; \ + touch $@ +endef + +$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE + $(call if_changed,objcopy) + +$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE + $(call if_changed,section_cmp) + +LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE + $(call if_changed,ld) + +LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T +$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE + $(call if_changed,ld) + +quiet_cmd_dumpsyms = DUMPSYMS $< +define cmd_dumpsyms + $(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@" +endef + +$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE + $(call if_changed,dumpsyms) + +OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms +$(obj)/syms.o: $(obj)/syms.bin FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load +$(obj)/info.bin: vmlinux FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info +$(obj)/info.o: $(obj)/info.bin FORCE + $(call if_changed,objcopy) + +OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + +suffix-$(CONFIG_KERNEL_GZIP) := .gz +suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 +suffix-$(CONFIG_KERNEL_LZ4) := .lz4 +suffix-$(CONFIG_KERNEL_LZMA) := .lzma +suffix-$(CONFIG_KERNEL_LZO) := .lzo +suffix-$(CONFIG_KERNEL_XZ) := .xz +suffix-$(CONFIG_KERNEL_ZSTD) := .zst + +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) +$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE + $(call if_changed,bzip2_with_size) +$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE + $(call if_changed,lz4_with_size) +$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzma_with_size) +$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE + $(call if_changed,lzo_with_size) +$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE + $(call if_changed,xzkern_with_size) +$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE + $(call if_changed,zstd22_with_size) + +OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed +$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE + $(call if_changed,objcopy) diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c new file mode 100644 index 0000000000..47c48fbfb5 --- /dev/null +++ b/arch/s390/boot/als.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2016 + */ +#include +#include +#include +#include +#include +#include "boot.h" + +/* + * The code within this file will be called very early. It may _not_ + * access anything within the bss section, since that is not cleared + * yet and may contain data (e.g. initrd) that must be saved by other + * code. + * For temporary objects the stack (16k) should be used. + */ + +static unsigned long als[] = { FACILITIES_ALS }; + +static void u16_to_hex(char *str, u16 val) +{ + int i, num; + + for (i = 1; i <= 4; i++) { + num = (val >> (16 - 4 * i)) & 0xf; + if (num >= 10) + num += 7; + *str++ = '0' + num; + } + *str = '\0'; +} + +static void print_machine_type(void) +{ + static char mach_str[80] = "Detected machine-type number: "; + char type_str[5]; + struct cpuid id; + + get_cpu_id(&id); + u16_to_hex(type_str, id.machine); + strcat(mach_str, type_str); + strcat(mach_str, "\n"); + sclp_early_printk(mach_str); +} + +static void u16_to_decimal(char *str, u16 val) +{ + int div = 1; + + while (div * 10 <= val) + div *= 10; + while (div) { + *str++ = '0' + val / div; + val %= div; + div /= 10; + } + *str = '\0'; +} + +void print_missing_facilities(void) +{ + static char als_str[80] = "Missing facilities: "; + unsigned long val; + char val_str[6]; + int i, j, first; + + first = 1; + for (i = 0; i < ARRAY_SIZE(als); i++) { + val = ~stfle_fac_list[i] & als[i]; + for (j = 0; j < BITS_PER_LONG; j++) { + if (!(val & (1UL << (BITS_PER_LONG - 1 - j)))) + continue; + if (!first) + strcat(als_str, ","); + /* + * Make sure we stay within one line. Consider that + * each facility bit adds up to five characters and + * z/VM adds a four character prefix. + */ + if (strlen(als_str) > 70) { + strcat(als_str, "\n"); + sclp_early_printk(als_str); + *als_str = '\0'; + } + u16_to_decimal(val_str, i * BITS_PER_LONG + j); + strcat(als_str, val_str); + first = 0; + } + } + strcat(als_str, "\n"); + sclp_early_printk(als_str); +} + +static void facility_mismatch(void) +{ + sclp_early_printk("The Linux kernel requires more recent processor hardware\n"); + print_machine_type(); + print_missing_facilities(); + sclp_early_printk("See Principles of Operations for facility bits\n"); + disabled_wait(); +} + +void verify_facilities(void) +{ + int i; + + __stfle(stfle_fac_list, ARRAY_SIZE(stfle_fac_list)); + for (i = 0; i < ARRAY_SIZE(als); i++) { + if ((stfle_fac_list[i] & als[i]) != als[i]) + facility_mismatch(); + } +} diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h new file mode 100644 index 0000000000..222c6886ac --- /dev/null +++ b/arch/s390/boot/boot.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_BOOT_H +#define BOOT_BOOT_H + +#include + +#define IPL_START 0x200 + +#ifndef __ASSEMBLY__ + +#include + +struct machine_info { + unsigned char has_edat1 : 1; + unsigned char has_edat2 : 1; + unsigned char has_nx : 1; +}; + +struct vmlinux_info { + unsigned long default_lma; + unsigned long entry; + unsigned long image_size; /* does not include .bss */ + unsigned long bss_size; /* uncompressed image .bss size */ + unsigned long bootdata_off; + unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long bootdata_preserved_size; + unsigned long dynsym_start; + unsigned long rela_dyn_start; + unsigned long rela_dyn_end; + unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; +#ifdef CONFIG_KASAN + unsigned long kasan_early_shadow_page_off; + unsigned long kasan_early_shadow_pte_off; + unsigned long kasan_early_shadow_pmd_off; + unsigned long kasan_early_shadow_pud_off; + unsigned long kasan_early_shadow_p4d_off; +#endif +}; + +void startup_kernel(void); +unsigned long detect_max_physmem_end(void); +void detect_physmem_online_ranges(unsigned long max_physmem_end); +void physmem_set_usable_limit(unsigned long limit); +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size); +void physmem_free(enum reserved_range_type type); +/* for continuous/multiple allocations per type */ +unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, + unsigned long align); +/* for single allocations, 1 per type */ +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom); +unsigned long get_physmem_alloc_pos(void); +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start); +bool is_ipl_block_dump(void); +void store_ipl_parmblock(void); +int read_ipl_report(void); +void save_ipl_cert_comp_list(void); +void setup_boot_command_line(void); +void parse_boot_command_line(void); +void verify_facilities(void); +void print_missing_facilities(void); +void sclp_early_setup_buffer(void); +void print_pgm_check_info(void); +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max); +void setup_vmem(unsigned long asce_limit); +void __printf(1, 2) decompressor_printk(const char *fmt, ...); +void print_stacktrace(unsigned long sp); +void error(char *m); + +extern struct machine_info machine; + +/* Symbols defined by linker scripts */ +extern const char kernel_version[]; +extern unsigned long memory_limit; +extern unsigned long vmalloc_size; +extern int vmalloc_size_set; +extern char __boot_data_start[], __boot_data_end[]; +extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +extern char _decompressor_syms_start[], _decompressor_syms_end[]; +extern char _stack_start[], _stack_end[]; +extern char _end[], _decompressor_end[]; +extern unsigned char _compressed_start[]; +extern unsigned char _compressed_end[]; +extern struct vmlinux_info _vmlinux_info; +#define vmlinux _vmlinux_info + +#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore)) + +static inline bool intersects(unsigned long addr0, unsigned long size0, + unsigned long addr1, unsigned long size1) +{ + return addr0 + size0 > addr1 && addr1 + size1 > addr0; +} +#endif /* __ASSEMBLY__ */ +#endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/clz_ctz.c b/arch/s390/boot/clz_ctz.c new file mode 100644 index 0000000000..c3ebf24859 --- /dev/null +++ b/arch/s390/boot/clz_ctz.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../../lib/clz_ctz.c" diff --git a/arch/s390/boot/cmdline.c b/arch/s390/boot/cmdline.c new file mode 100644 index 0000000000..73d826cdbd --- /dev/null +++ b/arch/s390/boot/cmdline.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../lib/cmdline.c" diff --git a/arch/s390/boot/ctype.c b/arch/s390/boot/ctype.c new file mode 100644 index 0000000000..2495810b47 --- /dev/null +++ b/arch/s390/boot/ctype.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../lib/ctype.c" diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c new file mode 100644 index 0000000000..d762733a07 --- /dev/null +++ b/arch/s390/boot/decompressor.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Definitions and wrapper functions for kernel decompressor + * + * Copyright IBM Corp. 2010 + * + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +/* + * gzip declarations + */ +#define STATIC static + +#undef memset +#undef memcpy +#undef memmove +#define memmove memmove +#define memzero(s, n) memset((s), 0, (n)) + +#if defined(CONFIG_KERNEL_BZIP2) +#define BOOT_HEAP_SIZE 0x400000 +#elif defined(CONFIG_KERNEL_ZSTD) +#define BOOT_HEAP_SIZE 0x30000 +#else +#define BOOT_HEAP_SIZE 0x10000 +#endif + +static unsigned long free_mem_ptr = (unsigned long) _end; +static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; + +#ifdef CONFIG_KERNEL_GZIP +#include "../../../../lib/decompress_inflate.c" +#endif + +#ifdef CONFIG_KERNEL_BZIP2 +#include "../../../../lib/decompress_bunzip2.c" +#endif + +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + +#ifdef CONFIG_KERNEL_LZMA +#include "../../../../lib/decompress_unlzma.c" +#endif + +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + +#ifdef CONFIG_KERNEL_XZ +#include "../../../../lib/decompress_unxz.c" +#endif + +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif + +#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE) + +unsigned long mem_safe_offset(void) +{ + /* + * due to 4MB HEAD_SIZE for bzip2 + * 'decompress_offset + vmlinux.image_size' could be larger than + * kernel at final position + its .bss, so take the larger of two + */ + return max(decompress_offset + vmlinux.image_size, + vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size); +} + +void *decompress_kernel(void) +{ + void *output = (void *)decompress_offset; + + __decompress(_compressed_start, _compressed_end - _compressed_start, + NULL, NULL, output, vmlinux.image_size, NULL, error); + return output; +} diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h new file mode 100644 index 0000000000..92b81d2ea3 --- /dev/null +++ b/arch/s390/boot/decompressor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H +#define BOOT_COMPRESSED_DECOMPRESSOR_H + +#ifdef CONFIG_KERNEL_UNCOMPRESSED +static inline void *decompress_kernel(void) { return NULL; } +#else +void *decompress_kernel(void); +#endif +unsigned long mem_safe_offset(void); + +#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/ebcdic.c b/arch/s390/boot/ebcdic.c new file mode 100644 index 0000000000..7391e7d360 --- /dev/null +++ b/arch/s390/boot/ebcdic.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/ebcdic.c" diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S new file mode 100644 index 0000000000..637c29c3f6 --- /dev/null +++ b/arch/s390/boot/head.S @@ -0,0 +1,320 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2010 + * + * Author(s): Hartmut Penner + * Martin Schwidefsky + * Rob van der Heij + * + * There are 5 different IPL methods + * 1) load the image directly into ram at address 0 and do an PSW restart + * 2) linload will load the image from address 0x10000 to memory 0x10000 + * and start the code thru LPSW 0x0008000080010000 (VM only, deprecated) + * 3) generate the tape ipl header, store the generated image on a tape + * and ipl from it + * In case of SL tape you need to IPL 5 times to get past VOL1 etc + * 4) generate the vm reader ipl header, move the generated image to the + * VM reader (use option NOH!) and do a ipl from reader (VM only) + * 5) direct call of start by the SALIPL loader + * We use the cpuid to distinguish between VM and native ipl + * params for kernel are pushed to 0x10400 (see setup.h) + * + */ + +#include +#include +#include +#include +#include +#include +#include "boot.h" + +#define EP_OFFSET 0x10008 +#define EP_STRING "S390EP" +#define IPL_BS 0x730 + +__HEAD +ipl_start: + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + sam64 # switch to 64 bit addressing mode + lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number + brctg %r1,.Lnoload # is valid + llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number + lghi %r2,IPL_BS # load start address + bras %r14,.Lloader # load rest of ipl image + larl %r12,parmarea # pointer to parameter area + stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number +# +# load parameter file from ipl device +# +.Lagain1: + larl %r2,_end # ramdisk loc. is temp + bras %r14,.Lloader # load parameter file + ltgr %r2,%r2 # got anything ? + jz .Lnopf + lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12) + aghi %r3,-1 + clgr %r2,%r3 + jl .Lnotrunc + lgr %r2,%r3 +.Lnotrunc: + larl %r4,_end + larl %r13,.L_hdr + clc 0(3,%r4),0(%r13) # if it is HDRx + jz .Lagain1 # skip dataset header + larl %r13,.L_eof + clc 0(3,%r4),0(%r13) # if it is EOFx + jz .Lagain1 # skip data set trailer + lgr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lgr %r7,%r2 + aghi %r7,1 + mvcl %r6,%r4 +.Lnopf: +# +# load ramdisk from ipl device +# +.Lagain2: + larl %r2,_end # addr of ramdisk + stg %r2,INITRD_START-PARMAREA(%r12) + bras %r14,.Lloader # load ramdisk + stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd + ltgr %r2,%r2 + jnz .Lrdcont + stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found +.Lrdcont: + larl %r2,_end + larl %r13,.L_hdr # skip HDRx and EOFx + clc 0(3,%r2),0(%r13) + jz .Lagain2 + larl %r13,.L_eof + clc 0(3,%r2),0(%r13) + jz .Lagain2 +# +# reset files in VM reader +# + larl %r13,.Lcpuid + stidp 0(%r13) # store cpuid + tm 0(%r13),0xff # running VM ? + jno .Lnoreset + larl %r2,.Lreset + lghi %r3,26 + diag %r2,%r3,8 + larl %r5,.Lirb + stsch 0(%r5) # check if irq is pending + tm 30(%r5),0x0f # by verifying if any of the + jnz .Lwaitforirq # activity or status control + tm 31(%r5),0xff # bits is set in the schib + jz .Lnoreset +.Lwaitforirq: + bras %r14,.Lirqwait # wait for IO interrupt + c %r1,__LC_SUBCHANNEL_ID # compare subchannel number + jne .Lwaitforirq + larl %r5,.Lirb + tsch 0(%r5) +.Lnoreset: + j .Lnoload +# +# everything loaded, go for it +# +.Lnoload: + jg startup +# +# subroutine to wait for end I/O +# +.Lirqwait: + larl %r13,.Lnewpswmask # set up IO interrupt psw + mvc __LC_IO_NEW_PSW(8),0(%r13) + stg %r14,__LC_IO_NEW_PSW+8 + larl %r13,.Lwaitpsw + lpswe 0(%r13) +.Lioint: +# +# subroutine for loading cards from the reader +# +.Lloader: + lgr %r4,%r14 + larl %r3,.Lorb # r2 = address of orb into r2 + larl %r5,.Lirb # r4 = address of irb + larl %r6,.Lccws + lghi %r7,20 +.Linit: + st %r2,4(%r6) # initialize CCW data addresses + la %r2,0x50(%r2) + la %r6,8(%r6) + brctg %r7,.Linit + larl %r13,.Lcr6 + lctlg %c6,%c6,0(%r13) + xgr %r2,%r2 +.Lldlp: + ssch 0(%r3) # load chunk of 1600 bytes + jnz .Llderr +.Lwait4irq: + bras %r14,.Lirqwait + c %r1,__LC_SUBCHANNEL_ID # compare subchannel number + jne .Lwait4irq + tsch 0(%r5) + xgr %r0,%r0 + ic %r0,8(%r5) # get device status + cghi %r0,8 # channel end ? + je .Lcont + cghi %r0,12 # channel end + device end ? + je .Lcont + llgf %r0,4(%r5) + sgf %r0,8(%r3) # r0/8 = number of ccws executed + mghi %r0,10 # *10 = number of bytes in ccws + llgh %r3,10(%r5) # get residual count + sgr %r0,%r3 # #ccws*80-residual=#bytes read + agr %r2,%r0 + br %r4 # r2 contains the total size +.Lcont: + aghi %r2,0x640 # add 0x640 to total size + larl %r6,.Lccws + lghi %r7,20 +.Lincr: + l %r0,4(%r6) # update CCW data addresses + aghi %r0,0x640 + st %r0,4(%r6) + aghi %r6,8 + brctg %r7,.Lincr + j .Lldlp +.Llderr: + larl %r13,.Lcrash + lpsw 0(%r13) + + .balign 8 +.Lwaitpsw: + .quad 0x0202000180000000,.Lioint +.Lnewpswmask: + .quad 0x0000000180000000 + .balign 8 +.Lorb: .long 0x00000000,0x0080ff00,.Lccws +.Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + .balign 8 +.Lcr6: .quad 0x00000000ff000000 + .balign 8 +.Lcrash:.long 0x000a0000,0x00000000 + .balign 8 +.Lccws: .rept 19 + .long 0x02600050,0x00000000 + .endr + .long 0x02200050,0x00000000 +.Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40 + .byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6 + .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" +.L_eof: .long 0xc5d6c600 /* C'EOF' */ +.L_hdr: .long 0xc8c4d900 /* C'HDR' */ + .balign 8 +.Lcpuid:.fill 8,1,0 + +# +# normal startup-code, running in absolute addressing mode +# this is called either by the ipl loader or directly by PSW restart +# or linload or SALIPL +# + .org STARTUP_NORMAL_OFFSET - IPL_START +SYM_CODE_START(startup) + j startup_normal + .org EP_OFFSET - IPL_START +# +# This is a list of s390 kernel entry points. At address 0x1000f the number of +# valid entry points is stored. +# +# IMPORTANT: Do not change this table, it is s390 kernel ABI! +# + .ascii EP_STRING + .byte 0x00,0x01 +# +# kdump startup-code, running in 64 bit absolute addressing mode +# + .org STARTUP_KDUMP_OFFSET - IPL_START + j startup_kdump +SYM_CODE_END(startup) +SYM_CODE_START_LOCAL(startup_normal) + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + bras %r13,0f + .fill 16,4,0x0 +0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs + sam64 # switch to 64 bit addressing mode + larl %r13,.Lext_new_psw + mvc __LC_EXT_NEW_PSW(16),0(%r13) + larl %r13,.Lpgm_new_psw + mvc __LC_PGM_NEW_PSW(16),0(%r13) + larl %r13,.Lio_new_psw + mvc __LC_IO_NEW_PSW(16),0(%r13) + xc 0x200(256),0x200 # partially clear lowcore + xc 0x300(256),0x300 + xc 0xe00(256),0xe00 + xc 0xf00(256),0xf00 + larl %r13,.Lctl + lctlg %c0,%c15,0(%r13) # load control registers + stcke __LC_BOOT_CLOCK + mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 + larl %r13,6f + spt 0(%r13) + mvc __LC_LAST_UPDATE_TIMER(8),0(%r13) + larl %r15,_stack_end-STACK_FRAME_OVERHEAD + brasl %r14,sclp_early_setup_buffer + brasl %r14,verify_facilities + brasl %r14,startup_kernel +SYM_CODE_END(startup_normal) + + .balign 8 +6: .long 0x7fffffff,0xffffffff +.Lext_new_psw: + .quad 0x0002000180000000,0x1b0 # disabled wait +.Lpgm_new_psw: + .quad 0x0000000180000000,startup_pgm_check_handler +.Lio_new_psw: + .quad 0x0002000180000000,0x1f0 # disabled wait +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad 0 # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad 0 # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0x0000000000008000 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad 0 # cr15: linkage stack operations + +#include "head_kdump.S" + +# +# This program check is active immediately after kernel start +# and until early_pgm_check_handler is set in kernel/early.c +# It simply saves general/control registers and psw in +# the save area and does disabled wait with a faulty address. +# +SYM_CODE_START_LOCAL(startup_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + la %r8,4095 + stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8) + stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8) + mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC + mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW + mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW + ni __LC_RETURN_PSW,0xfc # remove IO and EX bits + ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit + oi __LC_RETURN_PSW+1,0x2 # set wait state bit + larl %r9,.Lold_psw_disabled_wait + stg %r9,__LC_PGM_NEW_PSW+8 + larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD + brasl %r14,print_pgm_check_info +.Lold_psw_disabled_wait: + la %r8,4095 + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8) + lpswe __LC_RETURN_PSW # disabled wait +SYM_CODE_END(startup_pgm_check_handler) diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S new file mode 100644 index 0000000000..f7107c7625 --- /dev/null +++ b/arch/s390/boot/head_kdump.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 kdump lowlevel functions (new kernel) + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu + */ + +#include + +#define DATAMOVER_ADDR 0x4000 +#define COPY_PAGE_ADDR 0x6000 + +#ifdef CONFIG_CRASH_DUMP + +# +# kdump entry (new kernel - not yet relocated) +# +# Note: This code has to be position independent +# + +SYM_CODE_START_LOCAL(startup_kdump) + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to esame mode + sam64 # Switch to 64 bit addressing + basr %r13,0 +.Lbase: + larl %r2,.Lbase_addr # Check, if we have been + lg %r2,0(%r2) # already relocated: + clgr %r2,%r13 # + jne .Lrelocate # No : Start data mover + lghi %r2,0 # Yes: Start kdump kernel + brasl %r14,startup_kdump_relocated + +.Lrelocate: + larl %r4,startup + lg %r2,0x418(%r4) # Get kdump base + lg %r3,0x420(%r4) # Get kdump size + + larl %r10,.Lcopy_start # Source of data mover + lghi %r8,DATAMOVER_ADDR # Target of data mover + mvc 0(256,%r8),0(%r10) # Copy data mover code + + agr %r8,%r2 # Copy data mover to + mvc 0(256,%r8),0(%r10) # reserved mem + + lghi %r14,DATAMOVER_ADDR # Jump to copied data mover + basr %r14,%r14 +.Lbase_addr: + .quad .Lbase + +# +# kdump data mover code (runs at address DATAMOVER_ADDR) +# +# r2: kdump base address +# r3: kdump size +# +.Lcopy_start: + basr %r13,0 # Base +0: + lgr %r11,%r2 # Save kdump base address + lgr %r12,%r2 + agr %r12,%r3 # Compute kdump end address + + lghi %r5,0 + lghi %r10,COPY_PAGE_ADDR # Load copy page address +1: + mvc 0(256,%r10),0(%r5) # Copy old kernel to tmp + mvc 0(256,%r5),0(%r11) # Copy new kernel to old + mvc 0(256,%r11),0(%r10) # Copy tmp to new + aghi %r11,256 + aghi %r5,256 + clgr %r11,%r12 + jl 1b + + lg %r14,.Lstartup_kdump-0b(%r13) + basr %r14,%r14 # Start relocated kernel +.Lstartup_kdump: + .long 0x00000000,0x00000000 + startup_kdump_relocated +.Lcopy_end: + +# +# Startup of kdump (relocated new kernel) +# + .balign 2 +startup_kdump_relocated: + basr %r13,0 +0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel... +SYM_CODE_END(startup_kdump) + .balign 8 +.Lrestart_psw: + .quad 0x0000000080000000,0x0000000000000000 + startup +#else +SYM_CODE_START_LOCAL(startup_kdump) + larl %r13,startup_kdump_crash + lpswe 0(%r13) +SYM_CODE_END(startup_kdump) + .balign 8 +startup_kdump_crash: + .quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash +#endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh new file mode 100755 index 0000000000..a13dd2f2aa --- /dev/null +++ b/arch/s390/boot/install.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# arch/s390x/boot/install.sh +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for s390 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) + +echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ + "bootloader config required" >&2 +if [ -f "$4/vmlinuz-$1" ]; then mv -- "$4/vmlinuz-$1" "$4/vmlinuz-$1.old"; fi +if [ -f "$4/System.map-$1" ]; then mv -- "$4/System.map-$1" "$4/System.map-$1.old"; fi + +cat -- "$2" > "$4/vmlinuz-$1" +cp -- "$3" "$4/System.map-$1" diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c new file mode 100644 index 0000000000..0846e2b249 --- /dev/null +++ b/arch/s390/boot/ipl_data.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include "boot.h" + +#define CCW0(cmd, addr, cnt, flg) \ + { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, } + +#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA) + +struct ipl_lowcore { + psw_t32 ipl_psw; /* 0x0000 */ + struct ccw0 ccwpgm[2]; /* 0x0008 */ + u8 fill[56]; /* 0x0018 */ + struct ccw0 ccwpgmcc[20]; /* 0x0050 */ + u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */ + psw_t restart_psw; /* 0x01a0 */ + psw_t external_new_psw; /* 0x01b0 */ + psw_t svc_new_psw; /* 0x01c0 */ + psw_t program_new_psw; /* 0x01d0 */ + psw_t mcck_new_psw; /* 0x01e0 */ + psw_t io_new_psw; /* 0x01f0 */ +}; + +/* + * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to + * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded. + * The next 160 bytes are loaded to addresses 0x18-0xb7. They form + * the continuation of the CCW program started by IPL and load the + * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in + * memory. At the end of the channel program the PSW at location 0 is + * loaded. + * Initial processing starts at 0x200 = iplstart. + * + * The restart psw points to iplstart which allows to load a kernel + * image into memory and starting it by a psw restart on any cpu. All + * other default psw new locations contain a disabled wait psw where + * the address indicates which psw was loaded. + * + * Note that the 'file' utility can detect s390 kernel images. For + * that to succeed the two initial CCWs, and the 0x40 fill bytes must + * be present. + */ +static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = { + .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START }, + .ccwpgm = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + }, + .fill = { + [ 0 ... 55] = 0x40, + }, + .ccwpgmcc = { + [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC), + [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI), + }, + .restart_psw = { .mask = 0, .addr = IPL_START, }, + .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, }, + .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, }, + .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, }, + .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, }, + .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, }, +}; diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c new file mode 100644 index 0000000000..7b75217626 --- /dev/null +++ b/arch/s390/boot/ipl_parm.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "boot.h" + +struct parmarea parmarea __section(".parmarea") = { + .kernel_version = (unsigned long)kernel_version, + .max_command_line_size = COMMAND_LINE_SIZE, + .command_line = "root=/dev/ram0 ro", +}; + +char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; + +unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL; +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_block_valid); +int __bootdata_preserved(__kaslr_enabled); + +unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE; +unsigned long memory_limit; +int vmalloc_size_set; + +static inline int __diag308(unsigned long subcode, void *addr) +{ + unsigned long reg1, reg2; + union register_pair r1; + psw_t old; + + r1.even = (unsigned long) addr; + r1.odd = 0; + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[r1],%[subcode],0x308\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [r1] "+&d" (r1.pair), + [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [subcode] "d" (subcode), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) + : "cc", "memory"); + return r1.odd; +} + +void store_ipl_parmblock(void) +{ + int rc; + + rc = __diag308(DIAG308_STORE, &ipl_block); + if (rc == DIAG308_RC_OK && + ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) + ipl_block_valid = 1; +} + +bool is_ipl_block_dump(void) +{ + if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME && + ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return true; + if (ipl_block.pb0_hdr.pbt == IPL_PBT_ECKD && + ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return true; + return false; +} + +static size_t scpdata_length(const u8 *buf, size_t count) +{ + while (count) { + if (buf[count - 1] != '\0' && buf[count - 1] != ' ') + break; + count--; + } + return count; +} + +static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, + const struct ipl_parameter_block *ipb) +{ + const __u8 *scp_data; + __u32 scp_data_len; + int has_lowercase; + size_t count = 0; + size_t i; + + switch (ipb->pb0_hdr.pbt) { + case IPL_PBT_FCP: + scp_data_len = ipb->fcp.scp_data_len; + scp_data = ipb->fcp.scp_data; + break; + case IPL_PBT_NVME: + scp_data_len = ipb->nvme.scp_data_len; + scp_data = ipb->nvme.scp_data; + break; + case IPL_PBT_ECKD: + scp_data_len = ipb->eckd.scp_data_len; + scp_data = ipb->eckd.scp_data; + break; + + default: + goto out; + } + + count = min(size - 1, scpdata_length(scp_data, scp_data_len)); + if (!count) + goto out; + + has_lowercase = 0; + for (i = 0; i < count; i++) { + if (!isascii(scp_data[i])) { + count = 0; + goto out; + } + if (!has_lowercase && islower(scp_data[i])) + has_lowercase = 1; + } + + if (has_lowercase) + memcpy(dest, scp_data, count); + else + for (i = 0; i < count; i++) + dest[i] = tolower(scp_data[i]); +out: + dest[count] = '\0'; + return count; +} + +static void append_ipl_block_parm(void) +{ + char *parm, *delim; + size_t len, rc = 0; + + len = strlen(early_command_line); + + delim = early_command_line + len; /* '\0' character position */ + parm = early_command_line + len + 1; /* append right after '\0' */ + + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: + rc = ipl_block_get_ascii_vmparm( + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); + break; + case IPL_PBT_FCP: + case IPL_PBT_NVME: + case IPL_PBT_ECKD: + rc = ipl_block_get_ascii_scpdata( + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); + break; + } + if (rc) { + if (*parm == '=') + memmove(early_command_line, parm + 1, rc); + else + *delim = ' '; /* replace '\0' with space */ + } +} + +static inline int has_ebcdic_char(const char *str) +{ + int i; + + for (i = 0; str[i]; i++) + if (str[i] & 0x80) + return 1; + return 0; +} + +void setup_boot_command_line(void) +{ + parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0; + /* convert arch command line to ascii if necessary */ + if (has_ebcdic_char(parmarea.command_line)) + EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); + /* copy arch command line */ + strcpy(early_command_line, strim(parmarea.command_line)); + + /* append IPL PARM data to the boot command line */ + if (!is_prot_virt_guest() && ipl_block_valid) + append_ipl_block_parm(); +} + +static void modify_facility(unsigned long nr, bool clear) +{ + if (clear) + __clear_facility(nr, stfle_fac_list); + else + __set_facility(nr, stfle_fac_list); +} + +static void check_cleared_facilities(void) +{ + unsigned long als[] = { FACILITIES_ALS }; + int i; + + for (i = 0; i < ARRAY_SIZE(als); i++) { + if ((stfle_fac_list[i] & als[i]) != als[i]) { + sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n"); + print_missing_facilities(); + break; + } + } +} + +static void modify_fac_list(char *str) +{ + unsigned long val, endval; + char *endp; + bool clear; + + while (*str) { + clear = false; + if (*str == '!') { + clear = true; + str++; + } + val = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + if (*str == '-') { + str++; + endval = simple_strtoull(str, &endp, 0); + if (str == endp) + break; + str = endp; + while (val <= endval) { + modify_facility(val, clear); + val++; + } + } else { + modify_facility(val, clear); + } + if (*str != ',') + break; + str++; + } + check_cleared_facilities(); +} + +static char command_line_buf[COMMAND_LINE_SIZE]; +void parse_boot_command_line(void) +{ + char *param, *val; + bool enabled; + char *args; + int rc; + + __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); + args = strcpy(command_line_buf, early_command_line); + while (*args) { + args = next_arg(args, ¶m, &val); + + if (!strcmp(param, "mem") && val) + memory_limit = round_down(memparse(val, NULL), PAGE_SIZE); + + if (!strcmp(param, "vmalloc") && val) { + vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE); + vmalloc_size_set = 1; + } + + if (!strcmp(param, "dfltcc") && val) { + if (!strcmp(val, "off")) + zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED; + else if (!strcmp(val, "on")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL; + else if (!strcmp(val, "def_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_DEFLATE_ONLY; + else if (!strcmp(val, "inf_only")) + zlib_dfltcc_support = ZLIB_DFLTCC_INFLATE_ONLY; + else if (!strcmp(val, "always")) + zlib_dfltcc_support = ZLIB_DFLTCC_FULL_DEBUG; + } + + if (!strcmp(param, "facilities") && val) + modify_fac_list(val); + + if (!strcmp(param, "nokaslr")) + __kaslr_enabled = 0; + +#if IS_ENABLED(CONFIG_KVM) + if (!strcmp(param, "prot_virt")) { + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + } +#endif + } +} diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c new file mode 100644 index 0000000000..1803035e68 --- /dev/null +++ b/arch/s390/boot/ipl_report.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "boot.h" + +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); + +static struct ipl_rb_certificates *certs; +static struct ipl_rb_components *comps; +static bool ipl_report_needs_saving; + +#define for_each_rb_entry(entry, rb) \ + for (entry = rb->entries; \ + (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ + entry++) + +static unsigned long get_cert_comp_list_size(void) +{ + struct ipl_rb_certificate_entry *cert; + struct ipl_rb_component_entry *comp; + size_t size; + + /* + * Find the length for the IPL report boot data + */ + early_ipl_comp_list_size = 0; + for_each_rb_entry(comp, comps) + early_ipl_comp_list_size += sizeof(*comp); + ipl_cert_list_size = 0; + for_each_rb_entry(cert, certs) + ipl_cert_list_size += sizeof(unsigned int) + cert->len; + return ipl_cert_list_size + early_ipl_comp_list_size; +} + +bool ipl_report_certs_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + struct ipl_rb_certificate_entry *cert; + + if (!ipl_report_needs_saving) + return false; + + for_each_rb_entry(cert, certs) { + if (intersects(addr, size, cert->addr, cert->len)) { + *intersection_start = cert->addr; + return true; + } + } + return false; +} + +static void copy_components_bootdata(void) +{ + struct ipl_rb_component_entry *comp, *ptr; + + ptr = (struct ipl_rb_component_entry *) early_ipl_comp_list_addr; + for_each_rb_entry(comp, comps) + memcpy(ptr++, comp, sizeof(*ptr)); +} + +static void copy_certificates_bootdata(void) +{ + struct ipl_rb_certificate_entry *cert; + void *ptr; + + ptr = (void *) ipl_cert_list_addr; + for_each_rb_entry(cert, certs) { + *(unsigned int *) ptr = cert->len; + ptr += sizeof(unsigned int); + memcpy(ptr, (void *) cert->addr, cert->len); + ptr += cert->len; + } +} + +int read_ipl_report(void) +{ + struct ipl_pl_hdr *pl_hdr; + struct ipl_rl_hdr *rl_hdr; + struct ipl_rb_hdr *rb_hdr; + unsigned long tmp; + void *rl_end; + + /* + * Check if there is a IPL report by looking at the copy + * of the IPL parameter information block. + */ + if (!ipl_block_valid || + !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) + return -1; + ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); + /* + * There is an IPL report, to find it load the pointer to the + * IPL parameter information block from lowcore and skip past + * the IPL parameter list, then align the address to a double + * word boundary. + */ + tmp = (unsigned long) S390_lowcore.ipl_parmblock_ptr; + pl_hdr = (struct ipl_pl_hdr *) tmp; + tmp = (tmp + pl_hdr->len + 7) & -8UL; + rl_hdr = (struct ipl_rl_hdr *) tmp; + /* Walk through the IPL report blocks in the IPL Report list */ + certs = NULL; + comps = NULL; + rl_end = (void *) rl_hdr + rl_hdr->len; + rb_hdr = (void *) rl_hdr + sizeof(*rl_hdr); + while ((void *) rb_hdr + sizeof(*rb_hdr) < rl_end && + (void *) rb_hdr + rb_hdr->len <= rl_end) { + + switch (rb_hdr->rbt) { + case IPL_RBT_CERTIFICATES: + certs = (struct ipl_rb_certificates *) rb_hdr; + break; + case IPL_RBT_COMPONENTS: + comps = (struct ipl_rb_components *) rb_hdr; + break; + default: + break; + } + + rb_hdr = (void *) rb_hdr + rb_hdr->len; + } + + /* + * With either the component list or the certificate list + * missing the kernel will stay ignorant of secure IPL. + */ + if (!comps || !certs) { + certs = NULL; + return -1; + } + + ipl_report_needs_saving = true; + physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr, + (unsigned long)rl_end - (unsigned long)pl_hdr); + return 0; +} + +void save_ipl_cert_comp_list(void) +{ + unsigned long size; + + if (!ipl_report_needs_saving) + return; + + size = get_cert_comp_list_size(); + early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int)); + ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size; + + copy_components_bootdata(); + copy_certificates_bootdata(); + physmem_free(RR_IPLREPORT); + ipl_report_needs_saving = false; +} diff --git a/arch/s390/boot/ipl_vmparm.c b/arch/s390/boot/ipl_vmparm.c new file mode 100644 index 0000000000..8dacd5fadf --- /dev/null +++ b/arch/s390/boot/ipl_vmparm.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/ipl_vmparm.c" diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c new file mode 100644 index 0000000000..90602101e2 --- /dev/null +++ b/arch/s390/boot/kaslr.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2019 + */ +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +#define PRNG_MODE_TDES 1 +#define PRNG_MODE_SHA512 2 +#define PRNG_MODE_TRNG 3 + +struct prno_parm { + u32 res; + u32 reseed_counter; + u64 stream_bytes; + u8 V[112]; + u8 C[112]; +}; + +struct prng_parm { + u8 parm_block[32]; + u32 reseed_counter; + u64 byte_counter; +}; + +static int check_prng(void) +{ + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { + sclp_early_printk("KASLR disabled: CPU has no PRNG\n"); + return 0; + } + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + return PRNG_MODE_TRNG; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) + return PRNG_MODE_SHA512; + else + return PRNG_MODE_TDES; +} + +static int get_random(unsigned long limit, unsigned long *value) +{ + struct prng_parm prng = { + /* initial parameter block for tdes mode, copied from libica */ + .parm_block = { + 0x0F, 0x2B, 0x8E, 0x63, 0x8C, 0x8E, 0xD2, 0x52, + 0x64, 0xB7, 0xA0, 0x7B, 0x75, 0x28, 0xB8, 0xF4, + 0x75, 0x5F, 0xD2, 0xA6, 0x8D, 0x97, 0x11, 0xFF, + 0x49, 0xD8, 0x23, 0xF3, 0x7E, 0x21, 0xEC, 0xA0 + }, + }; + unsigned long seed, random; + struct prno_parm prno; + __u64 entropy[4]; + int mode, i; + + mode = check_prng(); + seed = get_tod_clock_fast(); + switch (mode) { + case PRNG_MODE_TRNG: + cpacf_trng(NULL, 0, (u8 *) &random, sizeof(random)); + break; + case PRNG_MODE_SHA512: + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prno, NULL, 0, + (u8 *) &seed, sizeof(seed)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &prno, (u8 *) &random, + sizeof(random), NULL, 0); + break; + case PRNG_MODE_TDES: + /* add entropy */ + *(unsigned long *) prng.parm_block ^= seed; + for (i = 0; i < 16; i++) { + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, + (u8 *) entropy, (u8 *) entropy, + sizeof(entropy)); + memcpy(prng.parm_block, entropy, sizeof(entropy)); + } + random = seed; + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, (u8 *) &random, + (u8 *) &random, sizeof(random)); + break; + default: + return -1; + } + *value = random % limit; + return 0; +} + +static void sort_reserved_ranges(struct reserved_range *res, unsigned long size) +{ + struct reserved_range tmp; + int i, j; + + for (i = 1; i < size; i++) { + tmp = res[i]; + for (j = i - 1; j >= 0 && res[j].start > tmp.start; j--) + res[j + 1] = res[j]; + res[j + 1] = tmp; + } +} + +static unsigned long iterate_valid_positions(unsigned long size, unsigned long align, + unsigned long _min, unsigned long _max, + struct reserved_range *res, size_t res_count, + bool pos_count, unsigned long find_pos) +{ + unsigned long start, end, tmp_end, range_pos, pos = 0; + struct reserved_range *res_end = res + res_count; + struct reserved_range *skip_res; + int i; + + align = max(align, 8UL); + _min = round_up(_min, align); + for_each_physmem_usable_range(i, &start, &end) { + if (_min >= end) + continue; + start = round_up(start, align); + if (start >= _max) + break; + start = max(_min, start); + end = min(_max, end); + + while (start + size <= end) { + /* skip reserved ranges below the start */ + while (res && res->end <= start) { + res++; + if (res >= res_end) + res = NULL; + } + skip_res = NULL; + tmp_end = end; + /* has intersecting reserved range */ + if (res && res->start < end) { + skip_res = res; + tmp_end = res->start; + } + if (start + size <= tmp_end) { + range_pos = (tmp_end - start - size) / align + 1; + if (pos_count) { + pos += range_pos; + } else { + if (range_pos >= find_pos) + return start + (find_pos - 1) * align; + find_pos -= range_pos; + } + } + if (!skip_res) + break; + start = round_up(skip_res->end, align); + } + } + + return pos_count ? pos : 0; +} + +/* + * Two types of decompressor memory allocations/reserves are considered + * differently. + * + * "Static" or "single" allocations are done via physmem_alloc_range() and + * physmem_reserve(), and they are listed in physmem_info.reserved[]. Each + * type of "static" allocation can only have one allocation per type and + * cannot have chains. + * + * On the other hand, "dynamic" or "repetitive" allocations are done via + * physmem_alloc_top_down(). These allocations are tightly packed together + * top down from the end of online memory. physmem_alloc_pos represents + * current position where those allocations start. + * + * Functions randomize_within_range() and iterate_valid_positions() + * only consider "dynamic" allocations by never looking above + * physmem_alloc_pos. "Static" allocations, however, are explicitly + * considered by checking the "res" (reserves) array. The first + * reserved_range of a "dynamic" allocation may also be checked along the + * way, but it will always be above the maximum value anyway. + */ +unsigned long randomize_within_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max) +{ + struct reserved_range res[RR_MAX]; + unsigned long max_pos, pos; + + memcpy(res, physmem_info.reserved, sizeof(res)); + sort_reserved_ranges(res, ARRAY_SIZE(res)); + max = min(max, get_physmem_alloc_pos()); + + max_pos = iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), true, 0); + if (!max_pos) + return 0; + if (get_random(max_pos, &pos)) + return 0; + return iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), false, pos + 1); +} diff --git a/arch/s390/boot/machine_kexec_reloc.c b/arch/s390/boot/machine_kexec_reloc.c new file mode 100644 index 0000000000..b7a5d0f720 --- /dev/null +++ b/arch/s390/boot/machine_kexec_reloc.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/machine_kexec_reloc.c" diff --git a/arch/s390/boot/mem.S b/arch/s390/boot/mem.S new file mode 100644 index 0000000000..b33463633f --- /dev/null +++ b/arch/s390/boot/mem.S @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "../lib/mem.S" diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c new file mode 100644 index 0000000000..97244cd7a2 --- /dev/null +++ b/arch/s390/boot/pgm_check_info.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "boot.h" + +const char hex_asc[] = "0123456789abcdef"; + +static char *as_hex(char *dst, unsigned long val, int pad) +{ + char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1); + + for (*p-- = 0; p >= dst; val >>= 4) + *p-- = hex_asc[val & 0x0f]; + return end; +} + +static char *symstart(char *p) +{ + while (*p) + p--; + return p + 1; +} + +static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len) +{ + /* symbol entries are in a form "10000 c4 startup\0" */ + char *a = _decompressor_syms_start; + char *b = _decompressor_syms_end; + unsigned long start; + unsigned long size; + char *pivot; + char *endp; + + while (a < b) { + pivot = symstart(a + (b - a) / 2); + start = simple_strtoull(pivot, &endp, 16); + size = simple_strtoull(endp + 1, &endp, 16); + if (ip < start) { + b = pivot; + continue; + } + if (ip > start + size) { + a = pivot + strlen(pivot) + 1; + continue; + } + *off = ip - start; + *len = size; + return endp + 1; + } + return NULL; +} + +static noinline char *strsym(void *ip) +{ + static char buf[64]; + unsigned short off; + unsigned short len; + char *p; + + p = findsym((unsigned long)ip, &off, &len); + if (p) { + strncpy(buf, p, sizeof(buf)); + /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */ + p = buf + strnlen(buf, sizeof(buf) - 15); + strcpy(p, "+0x"); + p = as_hex(p + 3, off, 0); + strcpy(p, "/0x"); + as_hex(p + 3, len, 0); + } else { + as_hex(buf, (unsigned long)ip, 16); + } + return buf; +} + +void decompressor_printk(const char *fmt, ...) +{ + char buf[1024] = { 0 }; + char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */ + unsigned long pad; + char *p = buf; + va_list args; + + va_start(args, fmt); + for (; p < end && *fmt; fmt++) { + if (*fmt != '%') { + *p++ = *fmt; + continue; + } + pad = isdigit(*++fmt) ? simple_strtol(fmt, (char **)&fmt, 10) : 0; + switch (*fmt) { + case 's': + p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf)); + break; + case 'p': + if (*++fmt != 'S') + goto out; + p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf)); + break; + case 'l': + if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned long), pad); + break; + case 'x': + if (end - p <= max(sizeof(int) * 2, pad)) + goto out; + p = as_hex(p, va_arg(args, unsigned int), pad); + break; + default: + goto out; + } + } +out: + va_end(args); + sclp_early_printk(buf); +} + +void print_stacktrace(unsigned long sp) +{ + struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start, + (unsigned long)_stack_end }; + bool first = true; + + decompressor_printk("Call Trace:\n"); + while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) { + struct stack_frame *sf = (struct stack_frame *)sp; + + decompressor_printk(first ? "(sp:%016lx [<%016lx>] %pS)\n" : + " sp:%016lx [<%016lx>] %pS\n", + sp, sf->gprs[8], (void *)sf->gprs[8]); + if (sf->back_chain <= sp) + break; + sp = sf->back_chain; + first = false; + } +} + +void print_pgm_check_info(void) +{ + unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area; + struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area); + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n", + S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1); + if (kaslr_enabled()) + decompressor_printk("Kernel random base: %lx\n", __kaslr_offset); + decompressor_printk("PSW : %016lx %016lx (%pS)\n", + S390_lowcore.psw_save_area.mask, + S390_lowcore.psw_save_area.addr, + (void *)S390_lowcore.psw_save_area.addr); + decompressor_printk( + " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n", + psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck, + psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri, + psw->eaba); + decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n", + gpregs[0], gpregs[1], gpregs[2], gpregs[3]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[4], gpregs[5], gpregs[6], gpregs[7]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[8], gpregs[9], gpregs[10], gpregs[11]); + decompressor_printk(" %016lx %016lx %016lx %016lx\n", + gpregs[12], gpregs[13], gpregs[14], gpregs[15]); + print_stacktrace(S390_lowcore.gpregs_save_area[15]); + decompressor_printk("Last Breaking-Event-Address:\n"); + decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, + (void *)S390_lowcore.pgm_last_break); +} diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c new file mode 100644 index 0000000000..0cf79826ee --- /dev/null +++ b/arch/s390/boot/physmem_info.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +struct physmem_info __bootdata(physmem_info); +static unsigned int physmem_alloc_ranges; +static unsigned long physmem_alloc_pos; + +/* up to 256 storage elements, 1020 subincrements each */ +#define ENTRIES_EXTENDED_MAX \ + (256 * (1020 / 2) * sizeof(struct physmem_range)) + +static struct physmem_range *__get_physmem_range_ptr(u32 n) +{ + if (n < MEM_INLINED_ENTRIES) + return &physmem_info.online[n]; + if (unlikely(!physmem_info.online_extended)) { + physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range( + RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0, + physmem_alloc_pos, true); + } + return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES]; +} + +/* + * sequential calls to add_physmem_online_range with adjacent memory ranges + * are merged together into single memory range. + */ +void add_physmem_online_range(u64 start, u64 end) +{ + struct physmem_range *range; + + if (physmem_info.range_count) { + range = __get_physmem_range_ptr(physmem_info.range_count - 1); + if (range->end == start) { + range->end = end; + return; + } + } + + range = __get_physmem_range_ptr(physmem_info.range_count); + range->start = start; + range->end = end; + physmem_info.range_count++; +} + +static int __diag260(unsigned long rx1, unsigned long rx2) +{ + unsigned long reg1, reg2, ry; + union register_pair rx; + psw_t old; + int rc; + + rx.even = rx1; + rx.odd = rx2; + ry = 0x10; /* storage configuration */ + rc = -1; /* fail */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " diag %[rx],%[ry],0x260\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [ry] "+&d" (ry), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [rx] "d" (rx.pair), + [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw) + : "cc", "memory"); + return rc == 0 ? ry : -1; +} + +static int diag260(void) +{ + int rc, i; + + struct { + unsigned long start; + unsigned long end; + } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */ + + memset(storage_extents, 0, sizeof(storage_extents)); + rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents)); + if (rc == -1) + return -1; + + for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++) + add_physmem_online_range(storage_extents[i].start, storage_extents[i].end + 1); + return 0; +} + +static int tprot(unsigned long addr) +{ + unsigned long reg1, reg2; + int rc = -EFAULT; + psw_t old; + + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " tprot 0(%[addr]),0\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + "=Q" (S390_lowcore.program_new_psw.addr), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [addr] "a" (addr) + : "cc", "memory"); + return rc; +} + +static unsigned long search_mem_end(void) +{ + unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ + unsigned long offset = 0; + unsigned long pivot; + + while (range > 1) { + range >>= 1; + pivot = offset + range; + if (!tprot(pivot << 20)) + offset = pivot; + } + return (offset + 1) << 20; +} + +unsigned long detect_max_physmem_end(void) +{ + unsigned long max_physmem_end = 0; + + if (!sclp_early_get_memsize(&max_physmem_end)) { + physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO; + } else { + max_physmem_end = search_mem_end(); + physmem_info.info_source = MEM_DETECT_BIN_SEARCH; + } + return max_physmem_end; +} + +void detect_physmem_online_ranges(unsigned long max_physmem_end) +{ + if (!sclp_early_read_storage_info()) { + physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO; + } else if (!diag260()) { + physmem_info.info_source = MEM_DETECT_DIAG260; + } else if (max_physmem_end) { + add_physmem_online_range(0, max_physmem_end); + } +} + +void physmem_set_usable_limit(unsigned long limit) +{ + physmem_info.usable = limit; + physmem_alloc_pos = limit; +} + +static void die_oom(unsigned long size, unsigned long align, unsigned long min, unsigned long max) +{ + unsigned long start, end, total_mem = 0, total_reserved_mem = 0; + struct reserved_range *range; + enum reserved_range_type t; + int i; + + decompressor_printk("Linux version %s\n", kernel_version); + if (!is_prot_virt_guest() && early_command_line[0]) + decompressor_printk("Kernel command line: %s\n", early_command_line); + decompressor_printk("Out of memory allocating %lx bytes %lx aligned in range %lx:%lx\n", + size, align, min, max); + decompressor_printk("Reserved memory ranges:\n"); + for_each_physmem_reserved_range(t, range, &start, &end) { + decompressor_printk("%016lx %016lx %s\n", start, end, get_rr_type_name(t)); + total_reserved_mem += end - start; + } + decompressor_printk("Usable online memory ranges (info source: %s [%x]):\n", + get_physmem_info_source(), physmem_info.info_source); + for_each_physmem_usable_range(i, &start, &end) { + decompressor_printk("%016lx %016lx\n", start, end); + total_mem += end - start; + } + decompressor_printk("Usable online memory total: %lx Reserved: %lx Free: %lx\n", + total_mem, total_reserved_mem, + total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0); + print_stacktrace(current_frame_address()); + sclp_early_printk("\n\n -- System halted\n"); + disabled_wait(); +} + +void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) +{ + physmem_info.reserved[type].start = addr; + physmem_info.reserved[type].end = addr + size; +} + +void physmem_free(enum reserved_range_type type) +{ + physmem_info.reserved[type].start = 0; + physmem_info.reserved[type].end = 0; +} + +static bool __physmem_alloc_intersects(unsigned long addr, unsigned long size, + unsigned long *intersection_start) +{ + unsigned long res_addr, res_size; + int t; + + for (t = 0; t < RR_MAX; t++) { + if (!get_physmem_reserved(t, &res_addr, &res_size)) + continue; + if (intersects(addr, size, res_addr, res_size)) { + *intersection_start = res_addr; + return true; + } + } + return ipl_report_certs_intersects(addr, size, intersection_start); +} + +static unsigned long __physmem_alloc_range(unsigned long size, unsigned long align, + unsigned long min, unsigned long max, + unsigned int from_ranges, unsigned int *ranges_left, + bool die_on_oom) +{ + unsigned int nranges = from_ranges ?: physmem_info.range_count; + unsigned long range_start, range_end; + unsigned long intersection_start; + unsigned long addr, pos = max; + + align = max(align, 8UL); + while (nranges) { + __get_physmem_range(nranges - 1, &range_start, &range_end, false); + pos = min(range_end, pos); + + if (round_up(min, align) + size > pos) + break; + addr = round_down(pos - size, align); + if (range_start > addr) { + nranges--; + continue; + } + if (__physmem_alloc_intersects(addr, size, &intersection_start)) { + pos = intersection_start; + continue; + } + + if (ranges_left) + *ranges_left = nranges; + return addr; + } + if (die_on_oom) + die_oom(size, align, min, max); + return 0; +} + +unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size, + unsigned long align, unsigned long min, unsigned long max, + bool die_on_oom) +{ + unsigned long addr; + + max = min(max, physmem_alloc_pos); + addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom); + if (addr) + physmem_reserve(type, addr, size); + return addr; +} + +unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size, + unsigned long align) +{ + struct reserved_range *range = &physmem_info.reserved[type]; + struct reserved_range *new_range; + unsigned int ranges_left; + unsigned long addr; + + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges, + &ranges_left, true); + /* if not a consecutive allocation of the same type or first allocation */ + if (range->start != addr + size) { + if (range->end) { + physmem_alloc_pos = __physmem_alloc_range( + sizeof(struct reserved_range), 0, 0, physmem_alloc_pos, + physmem_alloc_ranges, &ranges_left, true); + new_range = (struct reserved_range *)physmem_alloc_pos; + *new_range = *range; + range->chain = new_range; + addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, + ranges_left, &ranges_left, true); + } + range->end = addr + size; + } + range->start = addr; + physmem_alloc_pos = addr; + physmem_alloc_ranges = ranges_left; + return addr; +} + +unsigned long get_physmem_alloc_pos(void) +{ + return physmem_alloc_pos; +} diff --git a/arch/s390/boot/sclp_early_core.c b/arch/s390/boot/sclp_early_core.c new file mode 100644 index 0000000000..6f30646afb --- /dev/null +++ b/arch/s390/boot/sclp_early_core.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "boot.h" +#include "../../../drivers/s390/char/sclp_early_core.c" + +/* SCLP early buffer must stay page-aligned and below 2GB */ +static char __sclp_early_sccb[EXT_SCCB_READ_SCP] __aligned(PAGE_SIZE); + +void sclp_early_setup_buffer(void) +{ + sclp_early_set_buffer(&__sclp_early_sccb); +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c new file mode 100644 index 0000000000..d3e48bd9c3 --- /dev/null +++ b/arch/s390/boot/startup.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" +#include "uv.h" + +unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata_preserved(__abs_lowcore); +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +unsigned long __bootdata_preserved(VMALLOC_START); +unsigned long __bootdata_preserved(VMALLOC_END); +struct page *__bootdata_preserved(vmemmap); +unsigned long __bootdata_preserved(vmemmap_size); +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); + +u64 __bootdata_preserved(stfle_fac_list[16]); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); + +struct machine_info machine; + +void error(char *x) +{ + sclp_early_printk("\n\n"); + sclp_early_printk(x); + sclp_early_printk("\n\n -- System halted"); + + disabled_wait(); +} + +static void detect_facilities(void) +{ + if (test_facility(8)) { + machine.has_edat1 = 1; + __ctl_set_bit(0, 23); + } + if (test_facility(78)) + machine.has_edat2 = 1; + if (test_facility(130)) + machine.has_nx = 1; +} + +static void setup_lpp(void) +{ + S390_lowcore.current_pid = 0; + S390_lowcore.lpp = LPP_MAGIC; + if (test_facility(40)) + lpp(&S390_lowcore.lpp); +} + +#ifdef CONFIG_KERNEL_UNCOMPRESSED +unsigned long mem_safe_offset(void) +{ + return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; +} +#endif + +static void rescue_initrd(unsigned long min, unsigned long max) +{ + unsigned long old_addr, addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) + return; + if (!get_physmem_reserved(RR_INITRD, &addr, &size)) + return; + if (addr >= min && addr + size <= max) + return; + old_addr = addr; + physmem_free(RR_INITRD); + addr = physmem_alloc_top_down(RR_INITRD, size, 0); + memmove((void *)addr, (void *)old_addr, size); +} + +static void copy_bootdata(void) +{ + if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) + error(".boot.data section size mismatch"); + memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); + if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) + error(".boot.preserved.data section size mismatch"); + memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); +} + +static void handle_relocs(unsigned long offset) +{ + Elf64_Rela *rela_start, *rela_end, *rela; + int r_type, r_sym, rc; + Elf64_Addr loc, val; + Elf64_Sym *dynsym; + + rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; + rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; + dynsym = (Elf64_Sym *) vmlinux.dynsym_start; + for (rela = rela_start; rela < rela_end; rela++) { + loc = rela->r_offset + offset; + val = rela->r_addend; + r_sym = ELF64_R_SYM(rela->r_info); + if (r_sym) { + if (dynsym[r_sym].st_shndx != SHN_UNDEF) + val += dynsym[r_sym].st_value + offset; + } else { + /* + * 0 == undefined symbol table index (STN_UNDEF), + * used for R_390_RELATIVE, only add KASLR offset + */ + val += offset; + } + r_type = ELF64_R_TYPE(rela->r_info); + rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); + if (rc) + error("Unknown relocation type"); + } +} + +/* + * Merge information from several sources into a single ident_map_size value. + * "ident_map_size" represents the upper limit of physical memory we may ever + * reach. It might not be all online memory, but also include standby (offline) + * memory. "ident_map_size" could be lower then actual standby or even online + * memory present, due to limiting factors. We should never go above this limit. + * It is the size of our identity mapping. + * + * Consider the following factors: + * 1. max_physmem_end - end of physical memory online or standby. + * Always >= end of the last online memory range (get_physmem_online_end()). + * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the + * kernel is able to support. + * 3. "mem=" kernel command line option which limits physical memory usage. + * 4. OLDMEM_BASE which is a kdump memory limit when the kernel is executed as + * crash kernel. + * 5. "hsa" size which is a memory limit when the kernel is executed during + * zfcp/nvme dump. + */ +static void setup_ident_map_size(unsigned long max_physmem_end) +{ + unsigned long hsa_size; + + ident_map_size = max_physmem_end; + if (memory_limit) + ident_map_size = min(ident_map_size, memory_limit); + ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS); + +#ifdef CONFIG_CRASH_DUMP + if (oldmem_data.start) { + __kaslr_enabled = 0; + ident_map_size = min(ident_map_size, oldmem_data.size); + } else if (ipl_block_valid && is_ipl_block_dump()) { + __kaslr_enabled = 0; + if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size) + ident_map_size = min(ident_map_size, hsa_size); + } +#endif +} + +static unsigned long setup_kernel_memory_layout(void) +{ + unsigned long vmemmap_start; + unsigned long asce_limit; + unsigned long rte_size; + unsigned long pages; + unsigned long vsize; + unsigned long vmax; + + pages = ident_map_size / PAGE_SIZE; + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); + + /* choose kernel address space layout: 4 or 3 levels. */ + vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + + MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE; + vsize = size_add(vsize, vmalloc_size); + if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { + asce_limit = _REGION1_SIZE; + rte_size = _REGION2_SIZE; + } else { + asce_limit = _REGION2_SIZE; + rte_size = _REGION3_SIZE; + } + + /* + * Forcing modules and vmalloc area under the ultravisor + * secure storage limit, so that any vmalloc allocation + * we do could be used to back secure guest storage. + */ + vmax = adjust_to_uv_max(asce_limit); +#ifdef CONFIG_KASAN + /* force vmalloc and modules below kasan shadow */ + vmax = min(vmax, KASAN_SHADOW_START); +#endif + __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); + MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); + MODULES_VADDR = MODULES_END - MODULES_LEN; + VMALLOC_END = MODULES_VADDR; + + /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ + vmalloc_size = min(vmalloc_size, round_down(VMALLOC_END / 2, _REGION3_SIZE)); + VMALLOC_START = VMALLOC_END - vmalloc_size; + + /* split remaining virtual space between 1:1 mapping & vmemmap array */ + pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = SECTION_ALIGN_UP(pages); + /* keep vmemmap_start aligned to a top level region table entry */ + vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); + vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + /* maximum mappable address as seen by arch_get_mappable_range() */ + max_mappable = vmemmap_start; + /* make sure identity map doesn't overlay with vmemmap */ + ident_map_size = min(ident_map_size, vmemmap_start); + vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); + /* make sure vmemmap doesn't overlay with vmalloc area */ + VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + vmemmap = (struct page *)vmemmap_start; + + return asce_limit; +} + +/* + * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. + */ +static void clear_bss_section(unsigned long vmlinux_lma) +{ + memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size); +} + +/* + * Set vmalloc area size to an 8th of (potential) physical memory + * size, unless size has been set by kernel command line parameter. + */ +static void setup_vmalloc_size(void) +{ + unsigned long size; + + if (vmalloc_size_set) + return; + size = round_up(ident_map_size / 8, _SEGMENT_SIZE); + vmalloc_size = max(size, vmalloc_size); +} + +static void offset_vmlinux_info(unsigned long offset) +{ + *(unsigned long *)(&vmlinux.entry) += offset; + vmlinux.bootdata_off += offset; + vmlinux.bootdata_preserved_off += offset; + vmlinux.rela_dyn_start += offset; + vmlinux.rela_dyn_end += offset; + vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; +#ifdef CONFIG_KASAN + vmlinux.kasan_early_shadow_page_off += offset; + vmlinux.kasan_early_shadow_pte_off += offset; + vmlinux.kasan_early_shadow_pmd_off += offset; + vmlinux.kasan_early_shadow_pud_off += offset; + vmlinux.kasan_early_shadow_p4d_off += offset; +#endif +} + +void startup_kernel(void) +{ + unsigned long max_physmem_end; + unsigned long vmlinux_lma = 0; + unsigned long amode31_lma = 0; + unsigned long asce_limit; + unsigned long safe_addr; + void *img; + psw_t psw; + + setup_lpp(); + safe_addr = mem_safe_offset(); + + /* + * Reserve decompressor memory together with decompression heap, buffer and + * memory which might be occupied by uncompressed kernel at default 1Mb + * position (if KASLR is off or failed). + */ + physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size) + physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size); + oldmem_data.start = parmarea.oldmem_base; + oldmem_data.size = parmarea.oldmem_size; + + store_ipl_parmblock(); + read_ipl_report(); + uv_query_info(); + sclp_early_read_info(); + setup_boot_command_line(); + parse_boot_command_line(); + detect_facilities(); + sanitize_prot_virt_host(); + max_physmem_end = detect_max_physmem_end(); + setup_ident_map_size(max_physmem_end); + setup_vmalloc_size(); + asce_limit = setup_kernel_memory_layout(); + /* got final ident_map_size, physmem allocations could be performed now */ + physmem_set_usable_limit(ident_map_size); + detect_physmem_online_ranges(max_physmem_end); + save_ipl_cert_comp_list(); + rescue_initrd(safe_addr, ident_map_size); + + if (kaslr_enabled()) { + vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size, + THREAD_SIZE, vmlinux.default_lma, + ident_map_size); + if (vmlinux_lma) { + __kaslr_offset = vmlinux_lma - vmlinux.default_lma; + offset_vmlinux_info(__kaslr_offset); + } + } + vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; + physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size); + + if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { + img = decompress_kernel(); + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + } else if (__kaslr_offset) { + img = (void *)vmlinux.default_lma; + memmove((void *)vmlinux_lma, img, vmlinux.image_size); + memset(img, 0, vmlinux.image_size); + } + + /* vmlinux decompression is done, shrink reserved low memory */ + physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); + if (kaslr_enabled()) + amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); + amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size; + physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); + + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able using + * static memory references to data in .bss (i.e init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ + clear_bss_section(vmlinux_lma); + handle_relocs(__kaslr_offset); + setup_vmem(asce_limit); + copy_bootdata(); + + /* + * Save KASLR offset for early dumps, before vmcore_info is set. + * Mark as uneven to distinguish from real vmcore_info pointer. + */ + S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0; + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); +} diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c new file mode 100644 index 0000000000..faccb33b46 --- /dev/null +++ b/arch/s390/boot/string.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC +#include "../lib/string.c" + +int strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + +char *skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} + +char *strim(char *s) +{ + size_t size; + char *end; + + size = strlen(s); + if (!size) + return s; + + end = s + size - 1; + while (end >= s && isspace(*end)) + end--; + *(end + 1) = '\0'; + + return skip_spaces(s); +} + +/* Works only for digits and letters, but small and fast */ +#define TOLOWER(x) ((x) | 0x20) + +static unsigned int simple_guess_base(const char *cp) +{ + if (cp[0] == '0') { + if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2])) + return 16; + else + return 8; + } else { + return 10; + } +} + +/** + * simple_strtoull - convert a string to an unsigned long long + * @cp: The start of the string + * @endp: A pointer to the end of the parsed string will be placed here + * @base: The number base to use + */ + +unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base) +{ + unsigned long long result = 0; + + if (!base) + base = simple_guess_base(cp); + + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') + cp += 2; + + while (isxdigit(*cp)) { + unsigned int value; + + value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10; + if (value >= base) + break; + result = result * base + value; + cp++; + } + if (endp) + *endp = (char *)cp; + + return result; +} + +long simple_strtol(const char *cp, char **endp, unsigned int base) +{ + if (*cp == '-') + return -simple_strtoull(cp + 1, endp, base); + + return simple_strtoull(cp, endp, base); +} + +int kstrtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + case 'o': + case 'O': + switch (s[1]) { + case 'n': + case 'N': + *res = true; + return 0; + case 'f': + case 'F': + *res = false; + return 0; + default: + break; + } + default: + break; + } + + return -EINVAL; +} diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c new file mode 100644 index 0000000000..1e66d2cbb0 --- /dev/null +++ b/arch/s390/boot/uv.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "boot.h" +#include "uv.h" + +/* will be used in arch/s390/kernel/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif +#if IS_ENABLED(CONFIG_KVM) +int __bootdata_preserved(prot_virt_host); +#endif +struct uv_info __bootdata_preserved(uv_info); + +void uv_query_info(void) +{ + struct uv_cb_qui uvcb = { + .header.cmd = UVC_CMD_QUI, + .header.len = sizeof(uvcb) + }; + + if (!test_facility(158)) + return; + + /* rc==0x100 means that there is additional data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) + return; + + if (IS_ENABLED(CONFIG_KVM)) { + memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list)); + uv_info.uv_base_stor_len = uvcb.uv_base_stor_len; + uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len; + uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len; + uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len; + uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len; + uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); + uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; + uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; + uv_info.uv_feature_indications = uvcb.uv_feature_indications; + uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions; + uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; + uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; + uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; + uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver; + uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf; + uv_info.supp_secret_types = uvcb.supp_secret_types; + uv_info.max_secrets = uvcb.max_secrets; + } + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST + if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && + test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) + prot_virt_guest = 1; +#endif +} + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit) +{ + if (is_prot_virt_host() && uv_info.max_sec_stor_addr) + limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr); + return limit; +} + +static int is_prot_virt_host_capable(void) +{ + /* disable if no prot_virt=1 given on command-line */ + if (!is_prot_virt_host()) + return 0; + /* disable if protected guest virtualization is enabled */ + if (is_prot_virt_guest()) + return 0; + /* disable if no hardware support */ + if (!test_facility(158)) + return 0; + /* disable if kdump */ + if (oldmem_data.start) + return 0; + /* disable if stand-alone dump */ + if (ipl_block_valid && is_ipl_block_dump()) + return 0; + return 1; +} + +void sanitize_prot_virt_host(void) +{ + prot_virt_host = is_prot_virt_host_capable(); +} +#endif diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h new file mode 100644 index 0000000000..0f3070856f --- /dev/null +++ b/arch/s390/boot/uv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_UV_H +#define BOOT_UV_H + +#if IS_ENABLED(CONFIG_KVM) +unsigned long adjust_to_uv_max(unsigned long limit); +void sanitize_prot_virt_host(void); +#else +static inline unsigned long adjust_to_uv_max(unsigned long limit) +{ + return limit; +} +static inline void sanitize_prot_virt_host(void) {} +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +void uv_query_info(void); +#else +static inline void uv_query_info(void) {} +#endif + +#endif /* BOOT_UV_H */ diff --git a/arch/s390/boot/version.c b/arch/s390/boot/version.c new file mode 100644 index 0000000000..fd32f03877 --- /dev/null +++ b/arch/s390/boot/version.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "boot.h" + +const char kernel_version[] = UTS_RELEASE + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " UTS_VERSION; diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 0000000000..442a74f113 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +unsigned long __bootdata_preserved(s390_invalid_asce); + +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); +#endif + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +enum populate_mode { + POPULATE_NONE, + POPULATE_DIRECT, + POPULATE_ABS_LOWCORE, +#ifdef CONFIG_KASAN + POPULATE_KASAN_MAP_SHADOW, + POPULATE_KASAN_ZERO_SHADOW, + POPULATE_KASAN_SHALLOW +#endif +}; + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode); + +#ifdef CONFIG_KASAN + +#define kasan_early_shadow_page vmlinux.kasan_early_shadow_page_off +#define kasan_early_shadow_pte ((pte_t *)vmlinux.kasan_early_shadow_pte_off) +#define kasan_early_shadow_pmd ((pmd_t *)vmlinux.kasan_early_shadow_pmd_off) +#define kasan_early_shadow_pud ((pud_t *)vmlinux.kasan_early_shadow_pud_off) +#define kasan_early_shadow_p4d ((p4d_t *)vmlinux.kasan_early_shadow_p4d_off) +#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static pte_t pte_z; + +static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) +{ + start = PAGE_ALIGN_DOWN(__sha(start)); + end = PAGE_ALIGN(__sha(end)); + pgtable_populate(start, end, mode); +} + +static void kasan_populate_shadow(void) +{ + pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); + pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); + p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long memgap_start = 0; + unsigned long untracked_end; + unsigned long start, end; + int i; + + pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO)); + if (!machine.has_nx) + pte_z = clear_pte_bit(pte_z, __pgprot(_PAGE_NOEXEC)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); + memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); + + /* + * Current memory layout: + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan | + * | | / | zero page | + * +- vmalloc area -+ / | mapping | + * | vmalloc_size | / | (untracked) | + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ | unmapped | allocated per module + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + * + * Current memory layout (KASAN_VMALLOC): + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan zero page| (untracked) + * | | / | mapping | + * +- vmalloc area -+ / +----------------+ + * | vmalloc_size | / |shallow populate| + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ |shallow populate| + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ + */ + + for_each_physmem_usable_range(i, &start, &end) { + kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) + kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW); + memgap_start = end; + } + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + untracked_end = VMALLOC_START; + /* shallowly populate kasan shadow for vmalloc and modules */ + kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW); + } else { + untracked_end = MODULES_VADDR; + } + /* populate kasan shadow for untracked memory */ + kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); +} + +static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pgd_populate(&init_mm, pgd, kasan_early_shadow_p4d); + return true; + } + return false; +} + +static bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + p4d_populate(&init_mm, p4d, kasan_early_shadow_pud); + return true; + } + return false; +} + +static bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pud_populate(&init_mm, pud, kasan_early_shadow_pmd); + return true; + } + return false; +} + +static bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + if (mode == POPULATE_KASAN_ZERO_SHADOW && + IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate(&init_mm, pmd, kasan_early_shadow_pte); + return true; + } + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + pte_t entry; + + if (mode == POPULATE_KASAN_ZERO_SHADOW) { + set_pte(pte, pte_z); + return true; + } + return false; +} +#else + +static inline void kasan_populate_shadow(void) {} + +static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static inline bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr, + unsigned long end, enum populate_mode mode) +{ + return false; +} + +static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) +{ + return false; +} + +#endif + +/* + * Mimic virt_to_kpte() in lack of init_mm symbol. Skip pmd NULL check though. + */ +static inline pte_t *__virt_to_kpte(unsigned long va) +{ + return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va); +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER; + unsigned long *table; + + table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size); + crst_table_init(table, val); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + /* + * handling pte_leftovers this way helps to avoid memory fragmentation + * during POPULATE_KASAN_MAP_SHADOW when EDAT is off + */ + if (!pte_leftover) { + pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_mode mode) +{ + switch (mode) { + case POPULATE_NONE: + return -1; + case POPULATE_DIRECT: + return addr; + case POPULATE_ABS_LOWCORE: + return __abs_lowcore_pa(addr); +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: + addr = physmem_alloc_top_down(RR_VMEM, size, size); + memset((void *)addr, 0, size); + return addr; +#endif + default: + return -1; + } +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long pages = 0; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + if (kasan_pte_populate_zero_shadow(pte, mode)) + continue; + entry = __pte(_pa(addr, PAGE_SIZE, mode)); + entry = set_pte_bit(entry, PAGE_KERNEL); + if (!machine.has_nx) + entry = clear_pte_bit(entry, __pgprot(_PAGE_NOEXEC)); + set_pte(pte, entry); + pages++; + } + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_4K, pages); +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) + continue; + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL); + if (!machine.has_nx) + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmd, entry); + pages++; + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_1M, pages); +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next, pages = 0; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) + continue; + if (can_large_pud(pud, addr, next)) { + entry = __pud(_pa(addr, _REGION3_SIZE, mode)); + entry = set_pud_bit(entry, REGION3_KERNEL); + if (!machine.has_nx) + entry = clear_pud_bit(entry, __pgprot(_REGION_ENTRY_NOEXEC)); + set_pud(pud, entry); + pages++; + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next, mode); + } + if (mode == POPULATE_DIRECT) + update_page_count(PG_DIRECT_MAP_2G, pages); +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + if (kasan_p4d_populate_zero_shadow(p4d, addr, next, mode)) + continue; + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next, mode); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + if (kasan_pgd_populate_zero_shadow(pgd, addr, next, mode)) + continue; + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } +#ifdef CONFIG_KASAN + if (mode == POPULATE_KASAN_SHALLOW) + continue; +#endif + pgtable_p4d_populate(pgd, addr, next, mode); + } +} + +void setup_vmem(unsigned long asce_limit) +{ + unsigned long start, end; + unsigned long asce_type; + unsigned long asce_bits; + int i; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. + */ + pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); + for_each_physmem_usable_range(i, &start, &end) + pgtable_populate(start, end, POPULATE_DIRECT); + pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), + POPULATE_ABS_LOWCORE); + pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, + POPULATE_NONE); + memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area); + + kasan_populate_shadow(); + + S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + + init_mm.context.asce = S390_lowcore.kernel_asce; +} diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S new file mode 100644 index 0000000000..389df0e0d9 --- /dev/null +++ b/arch/s390/boot/vmlinux.lds.S @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include "boot.h" + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) + +ENTRY(startup) + +SECTIONS +{ + . = 0; + .ipldata : { + *(.ipldata) + } + . = IPL_START; + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + . = PARMAREA; + .parmarea : { + *(.parmarea) + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + NOTES + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + + BOOT_DATA + BOOT_DATA_PRESERVED + + /* + * This is the BSS section of the decompressor and not of the decompressed Linux kernel. + * It will consume place in the decompressor's image. + */ + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + /* + * Stacks for the decompressor + */ + . = ALIGN(PAGE_SIZE); + _dump_info_stack_start = .; + . += PAGE_SIZE; + _dump_info_stack_end = .; + . = ALIGN(PAGE_SIZE); + _stack_start = .; + . += BOOT_STACK_SIZE; + _stack_end = .; + _ebss = .; + } + + /* + * uncompressed image info used by the decompressor it should match + * struct vmlinux_info. It comes from .vmlinux.info section of + * uncompressed vmlinux in a form of info.o + */ + . = ALIGN(8); + .vmlinux.info : { + _vmlinux_info = .; + *(.vmlinux.info) + } + + .decompressor.syms : { + . += 1; /* make sure we have \0 before the first entry */ + . = ALIGN(2); + _decompressor_syms_start = .; + *(.decompressor.syms) + _decompressor_syms_end = .; + } + + _decompressor_end = .; + +#ifdef CONFIG_KERNEL_UNCOMPRESSED + . = 0x100000; +#else + . = ALIGN(8); +#endif + .rodata.compressed : { + _compressed_start = .; + *(.vmlinux.bin.compressed) + _compressed_end = .; + } + +#define SB_TRAILER_SIZE 32 + /* Trailer needed for Secure Boot */ + . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ + . = ALIGN(4096) - SB_TRAILER_SIZE; + .sb.trailer : { + QUAD(0) + QUAD(0) + QUAD(0) + QUAD(0x000000207a49504c) + } + _end = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.eh_frame) + *(__ex_table) + *(*__ksymtab*) + *(___kcrctab*) + } +} diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config new file mode 100644 index 0000000000..eb7f84f592 --- /dev/null +++ b/arch/s390/configs/btf.config @@ -0,0 +1,2 @@ +# Help: Enable BTF debug info +CONFIG_DEBUG_INFO_BTF=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig new file mode 100644 index 0000000000..dd06086293 --- /dev/null +++ b/arch/s390/configs/debug_defconfig @@ -0,0 +1,887 @@ +CONFIG_UAPI_HEADER_TEST=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y +CONFIG_PREEMPT=y +CONFIG_SCHED_CORE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_NUMA_BALANCING=y +CONFIG_MEMCG=y +CONFIG_BLK_CGROUP=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_EXPERT=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_PROFILING=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y +CONFIG_LIVEPATCH=y +CONFIG_MARCH_ZEC12=y +CONFIG_TUNE_ZEC12=y +CONFIG_NR_CPUS=512 +CONFIG_NUMA=y +CONFIG_HZ_100=y +CONFIG_CERT_STORE=y +CONFIG_EXPOLINE=y +CONFIG_EXPOLINE_AUTO=y +CONFIG_CHSC_SCH=y +CONFIG_VFIO_CCW=m +CONFIG_VFIO_AP=m +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y +CONFIG_CMM=m +CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y +CONFIG_KVM=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_STATIC_KEYS_SELFTEST=y +CONFIG_SECCOMP_CACHE_DEBUG=y +CONFIG_LOCK_EVENT_COUNTS=y +# CONFIG_GCC_PLUGINS is not set +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_IBM_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSMALLOC_STAT=y +CONFIG_SLUB_STATS=y +# CONFIG_COMPAT_BRK is not set +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_KSM=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CMA_DEBUG=y +CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 +CONFIG_MEM_SOFT_DIRTY=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_PERCPU_STATS=y +CONFIG_GUP_TEST=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=m +CONFIG_UNIX=y +CONFIG_UNIX_DIAG=m +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y +CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NFT_CT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NETFILTER_XT_SET=m +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_PE_SIP=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +CONFIG_RDS_DEBUG=y +CONFIG_L2TP=m +CONFIG_L2TP_DEBUGFS=m +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=y +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_DNS_RESOLVER=y +CONFIG_OPENVSWITCH=m +CONFIG_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_NET_PKTGEN=m +CONFIG_PCI=y +# CONFIG_PCIEASPM is not set +CONFIG_PCI_DEBUG=y +CONFIG_PCI_IOV=y +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_S390=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set +CONFIG_CONNECTOR=y +CONFIG_ZRAM=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_DRBD=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=32768 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_NVME=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_GENWQE=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_ISCSI_TCP=m +CONFIG_SCSI_DEBUG=m +CONFIG_ZFCP=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +CONFIG_BLK_DEV_DM=y +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m +CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_SWITCH=m +CONFIG_DM_INTEGRITY=m +CONFIG_NETDEVICES=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_IFB=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m +CONFIG_TUN=m +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_ADAPTEC is not set +# CONFIG_NET_VENDOR_AGERE is not set +# CONFIG_NET_VENDOR_ALACRITECH is not set +# CONFIG_NET_VENDOR_ALTEON is not set +# CONFIG_NET_VENDOR_AMAZON is not set +# CONFIG_NET_VENDOR_AMD is not set +# CONFIG_NET_VENDOR_AQUANTIA is not set +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set +# CONFIG_NET_VENDOR_ATHEROS is not set +# CONFIG_NET_VENDOR_BROADCOM is not set +# CONFIG_NET_VENDOR_CADENCE is not set +# CONFIG_NET_VENDOR_CAVIUM is not set +# CONFIG_NET_VENDOR_CHELSIO is not set +# CONFIG_NET_VENDOR_CISCO is not set +# CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set +# CONFIG_NET_VENDOR_DEC is not set +# CONFIG_NET_VENDOR_DLINK is not set +# CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set +# CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set +# CONFIG_NET_VENDOR_GOOGLE is not set +# CONFIG_NET_VENDOR_HUAWEI is not set +# CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set +# CONFIG_NET_VENDOR_MARVELL is not set +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_MICREL is not set +# CONFIG_NET_VENDOR_MICROCHIP is not set +# CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set +# CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set +# CONFIG_NET_VENDOR_NATSEMI is not set +# CONFIG_NET_VENDOR_NETERION is not set +# CONFIG_NET_VENDOR_NETRONOME is not set +# CONFIG_NET_VENDOR_NVIDIA is not set +# CONFIG_NET_VENDOR_OKI is not set +# CONFIG_NET_VENDOR_PACKET_ENGINES is not set +# CONFIG_NET_VENDOR_PENSANDO is not set +# CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set +# CONFIG_NET_VENDOR_QUALCOMM is not set +# CONFIG_NET_VENDOR_RDC is not set +# CONFIG_NET_VENDOR_REALTEK is not set +# CONFIG_NET_VENDOR_RENESAS is not set +# CONFIG_NET_VENDOR_ROCKER is not set +# CONFIG_NET_VENDOR_SAMSUNG is not set +# CONFIG_NET_VENDOR_SEEQ is not set +# CONFIG_NET_VENDOR_SILAN is not set +# CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set +# CONFIG_NET_VENDOR_SMSC is not set +# CONFIG_NET_VENDOR_SOCIONEXT is not set +# CONFIG_NET_VENDOR_STMICRO is not set +# CONFIG_NET_VENDOR_SUN is not set +# CONFIG_NET_VENDOR_SYNOPSYS is not set +# CONFIG_NET_VENDOR_TEHUTI is not set +# CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set +# CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set +# CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_ISM=m +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO is not set +CONFIG_LEGACY_PTY_COUNT=0 +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_HANGCHECK_TIMER=m +CONFIG_TN3270_FS=y +CONFIG_PPS=m +# CONFIG_PTP_1588_CLOCK is not set +# CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_NOWAYOUT=y +CONFIG_SOFT_WATCHDOG=m +CONFIG_DIAG288_WATCHDOG=m +# CONFIG_DRM_DEBUG_MODESET_LOCK is not set +CONFIG_FB=y +# CONFIG_FB_DEVICE is not set +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_SYNC_FILE=y +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m +CONFIG_MLX5_VFIO_PCI=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_JBD2_DEBUG=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=y +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_DEBUG=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_BTRFS_FS=y +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_BTRFS_DEBUG=y +CONFIG_BTRFS_ASSERT=y +CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QUOTA_DEBUG=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_AUTOFS_FS=m +CONFIG_FUSE_FS=y +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=m +CONFIG_CACHEFILES=m +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m +CONFIG_NTFS_FS=m +CONFIG_NTFS_RW=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y +CONFIG_HUGETLBFS=y +CONFIG_ECRYPT_FS=m +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +CONFIG_ROMFS_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_CIFS=m +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +CONFIG_UNICODE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y +CONFIG_INTEGRITY_SIGNATURE=y +CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y +CONFIG_IMA=y +CONFIG_IMA_DEFAULT_HASH_SHA256=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_APPRAISE=y +CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_CRYPTO_USER=m +# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARIA=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SM4_GENERIC=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=m +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_STATS=y +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m +CONFIG_CRYPTO_SHA1_S390=m +CONFIG_CRYPTO_SHA256_S390=m +CONFIG_CRYPTO_SHA3_256_S390=m +CONFIG_CRYPTO_SHA3_512_S390=m +CONFIG_CRYPTO_GHASH_S390=m +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CORDIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRC32_SELFTEST=y +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_CRC8=m +CONFIG_RANDOM32_SELFTEST=y +CONFIG_XZ_DEC_MICROLZMA=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 +CONFIG_PRINTK_TIME=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y +CONFIG_HEADERS_INSTALL=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_RODATA_TEST=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_SELFTEST=y +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_KFENCE=y +CONFIG_KFENCE_DEFERRABLE=y +CONFIG_KFENCE_STATIC_KEYS=y +CONFIG_DEBUG_SHIRQ=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_WQ_WATCHDOG=y +CONFIG_TEST_LOCKUP=m +CONFIG_DEBUG_PREEMPT=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCK_STAT=y +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DEBUG_LOCKING_API_SELFTESTS=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_DEBUG_LIST=y +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m +CONFIG_RCU_CPU_STALL_TIMEOUT=300 +# CONFIG_RCU_TRACE is not set +CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_PREEMPT_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_USER_EVENTS=y +CONFIG_HIST_TRIGGERS=y +CONFIG_FTRACE_STARTUP_TEST=y +# CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_DEBUG_ENTRY=y +CONFIG_CIO_INJECT=y +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y +CONFIG_NOTIFIER_ERROR_INJECTION=m +CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m +CONFIG_FAULT_INJECTION=y +CONFIG_FAILSLAB=y +CONFIG_FAIL_PAGE_ALLOC=y +CONFIG_FAULT_INJECTION_USERCOPY=y +CONFIG_FAIL_MAKE_REQUEST=y +CONFIG_FAIL_IO_TIMEOUT=y +CONFIG_FAIL_FUTEX=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y +CONFIG_FAULT_INJECTION_CONFIGFS=y +CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y +CONFIG_LKDTM=m +CONFIG_TEST_MIN_HEAP=y +CONFIG_KPROBES_SANITY_TEST=m +CONFIG_RBTREE_TEST=y +CONFIG_INTERVAL_TREE_TEST=m +CONFIG_PERCPU_TEST=m +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_STRING_SELFTEST=y +CONFIG_TEST_BITOPS=m +CONFIG_TEST_BPF=m +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig new file mode 100644 index 0000000000..1b8150e50f --- /dev/null +++ b/arch/s390/configs/defconfig @@ -0,0 +1,816 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_WATCH_QUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_LSM=y +CONFIG_SCHED_CORE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_NUMA_BALANCING=y +CONFIG_MEMCG=y +CONFIG_BLK_CGROUP=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_EXPERT=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_PROFILING=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y +CONFIG_CRASH_DUMP=y +CONFIG_LIVEPATCH=y +CONFIG_MARCH_ZEC12=y +CONFIG_TUNE_ZEC12=y +CONFIG_NR_CPUS=512 +CONFIG_NUMA=y +CONFIG_HZ_100=y +CONFIG_CERT_STORE=y +CONFIG_EXPOLINE=y +CONFIG_EXPOLINE_AUTO=y +CONFIG_CHSC_SCH=y +CONFIG_VFIO_CCW=m +CONFIG_VFIO_AP=m +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y +CONFIG_CMM=m +CONFIG_APPLDATA_BASE=y +CONFIG_S390_HYPFS_FS=y +CONFIG_KVM=m +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m +CONFIG_S390_MODULES_SANITY_TEST=m +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_GCC_PLUGINS is not set +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG_SHA256=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_IBM_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BINFMT_MISC=m +CONFIG_ZSWAP=y +CONFIG_ZSMALLOC_STAT=y +# CONFIG_COMPAT_BRK is not set +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_KSM=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CMA_SYSFS=y +CONFIG_CMA_AREAS=7 +CONFIG_MEM_SOFT_DIRTY=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_PERCPU_STATS=y +CONFIG_ANON_VMA_NAME=y +CONFIG_USERFAULTFD=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=m +CONFIG_UNIX=y +CONFIG_UNIX_DIAG=m +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESPINTCP=y +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_MPTCP=y +CONFIG_NETFILTER=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_NETFILTER_NETLINK_HOOK=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NFT_CT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_NAT=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB_INET=m +CONFIG_NETFILTER_XTABLES_COMPAT=y +CONFIG_NETFILTER_XT_SET=m +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_TWOS=m +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_PE_SIP=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_LOG_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +CONFIG_L2TP=m +CONFIG_L2TP_DEBUGFS=m +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_BRIDGE=y +CONFIG_BRIDGE_MRP=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=y +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_DNS_RESOLVER=y +CONFIG_OPENVSWITCH=m +CONFIG_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_NET_SWITCHDEV=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_NET_PKTGEN=m +CONFIG_PCI=y +# CONFIG_PCIEASPM is not set +CONFIG_PCI_IOV=y +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_S390=y +CONFIG_UEVENT_HELPER=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +# CONFIG_FW_LOADER is not set +CONFIG_CONNECTOR=y +CONFIG_ZRAM=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_DRBD=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=32768 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_NVME=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_GENWQE=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_ISCSI_TCP=m +CONFIG_SCSI_DEBUG=m +CONFIG_ZFCP=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +# CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +CONFIG_BLK_DEV_DM=y +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m +CONFIG_DM_DELAY=m +CONFIG_DM_INIT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_SWITCH=m +CONFIG_DM_INTEGRITY=m +CONFIG_NETDEVICES=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_IFB=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_VXLAN=m +CONFIG_BAREUDP=m +CONFIG_AMT=m +CONFIG_TUN=m +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_ADAPTEC is not set +# CONFIG_NET_VENDOR_AGERE is not set +# CONFIG_NET_VENDOR_ALACRITECH is not set +# CONFIG_NET_VENDOR_ALTEON is not set +# CONFIG_NET_VENDOR_AMAZON is not set +# CONFIG_NET_VENDOR_AMD is not set +# CONFIG_NET_VENDOR_AQUANTIA is not set +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_ASIX is not set +# CONFIG_NET_VENDOR_ATHEROS is not set +# CONFIG_NET_VENDOR_BROADCOM is not set +# CONFIG_NET_VENDOR_CADENCE is not set +# CONFIG_NET_VENDOR_CAVIUM is not set +# CONFIG_NET_VENDOR_CHELSIO is not set +# CONFIG_NET_VENDOR_CISCO is not set +# CONFIG_NET_VENDOR_CORTINA is not set +# CONFIG_NET_VENDOR_DAVICOM is not set +# CONFIG_NET_VENDOR_DEC is not set +# CONFIG_NET_VENDOR_DLINK is not set +# CONFIG_NET_VENDOR_EMULEX is not set +# CONFIG_NET_VENDOR_ENGLEDER is not set +# CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_FUNGIBLE is not set +# CONFIG_NET_VENDOR_GOOGLE is not set +# CONFIG_NET_VENDOR_HUAWEI is not set +# CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_LITEX is not set +# CONFIG_NET_VENDOR_MARVELL is not set +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_MICREL is not set +# CONFIG_NET_VENDOR_MICROCHIP is not set +# CONFIG_NET_VENDOR_MICROSEMI is not set +# CONFIG_NET_VENDOR_MICROSOFT is not set +# CONFIG_NET_VENDOR_MYRI is not set +# CONFIG_NET_VENDOR_NI is not set +# CONFIG_NET_VENDOR_NATSEMI is not set +# CONFIG_NET_VENDOR_NETERION is not set +# CONFIG_NET_VENDOR_NETRONOME is not set +# CONFIG_NET_VENDOR_NVIDIA is not set +# CONFIG_NET_VENDOR_OKI is not set +# CONFIG_NET_VENDOR_PACKET_ENGINES is not set +# CONFIG_NET_VENDOR_PENSANDO is not set +# CONFIG_NET_VENDOR_QLOGIC is not set +# CONFIG_NET_VENDOR_BROCADE is not set +# CONFIG_NET_VENDOR_QUALCOMM is not set +# CONFIG_NET_VENDOR_RDC is not set +# CONFIG_NET_VENDOR_REALTEK is not set +# CONFIG_NET_VENDOR_RENESAS is not set +# CONFIG_NET_VENDOR_ROCKER is not set +# CONFIG_NET_VENDOR_SAMSUNG is not set +# CONFIG_NET_VENDOR_SEEQ is not set +# CONFIG_NET_VENDOR_SILAN is not set +# CONFIG_NET_VENDOR_SIS is not set +# CONFIG_NET_VENDOR_SOLARFLARE is not set +# CONFIG_NET_VENDOR_SMSC is not set +# CONFIG_NET_VENDOR_SOCIONEXT is not set +# CONFIG_NET_VENDOR_STMICRO is not set +# CONFIG_NET_VENDOR_SUN is not set +# CONFIG_NET_VENDOR_SYNOPSYS is not set +# CONFIG_NET_VENDOR_TEHUTI is not set +# CONFIG_NET_VENDOR_TI is not set +# CONFIG_NET_VENDOR_VERTEXCOM is not set +# CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WANGXUN is not set +# CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_XILINX is not set +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_ISM=m +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO is not set +CONFIG_LEGACY_PTY_COUNT=0 +# CONFIG_LEGACY_TIOCSTI is not set +CONFIG_VIRTIO_CONSOLE=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_HANGCHECK_TIMER=m +CONFIG_TN3270_FS=y +# CONFIG_PTP_1588_CLOCK is not set +# CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +CONFIG_WATCHDOG_NOWAYOUT=y +CONFIG_SOFT_WATCHDOG=m +CONFIG_DIAG288_WATCHDOG=m +CONFIG_FB=y +# CONFIG_FB_DEVICE is not set +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_SYNC_FILE=y +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m +CONFIG_MLX5_VFIO_PCI=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_JBD2_DEBUG=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=y +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_BTRFS_FS=y +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_AUTOFS_FS=m +CONFIG_FUSE_FS=y +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_NETFS_STATS=y +CONFIG_FSCACHE=m +CONFIG_CACHEFILES=m +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_EXFAT_FS=m +CONFIG_NTFS_FS=m +CONFIG_NTFS_RW=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_INODE64=y +CONFIG_TMPFS_QUOTA=y +CONFIG_HUGETLBFS=y +CONFIG_CONFIGFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +CONFIG_ROMFS_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_CIFS=m +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +CONFIG_UNICODE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_NOTIFICATIONS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y +CONFIG_SECURITY_LANDLOCK=y +CONFIG_INTEGRITY_SIGNATURE=y +CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_INTEGRITY_PLATFORM_KEYRING=y +CONFIG_IMA=y +CONFIG_IMA_DEFAULT_HASH_SHA256=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_APPRAISE=y +CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" +CONFIG_INIT_STACK_NONE=y +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_USER=m +# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDSA=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_SM2=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARIA=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SM4_GENERIC=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_HCTR2=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SM3_GENERIC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=m +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_STATS=y +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_SHA512_S390=m +CONFIG_CRYPTO_SHA1_S390=m +CONFIG_CRYPTO_SHA256_S390=m +CONFIG_CRYPTO_SHA3_256_S390=m +CONFIG_CRYPTO_SHA3_512_S390=m +CONFIG_CRYPTO_GHASH_S390=m +CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_DES_S390=m +CONFIG_CRYPTO_CHACHA_S390=m +CONFIG_ZCRYPT=m +CONFIG_PKEY=m +CONFIG_CRYPTO_PAES_S390=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CORDIC=m +CONFIG_PRIME_NUMBERS=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_CRC8=m +CONFIG_XZ_DEC_MICROLZMA=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 +CONFIG_PRINTK_TIME=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_WX=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_TEST_LOCKUP=m +CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_REF_SCALE_TEST=m +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +CONFIG_LATENCYTOP=y +CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FPROBE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_USER_EVENTS=y +CONFIG_HIST_TRIGGERS=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m +CONFIG_SAMPLE_FTRACE_OPS=m +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y +CONFIG_LKDTM=m +CONFIG_KPROBES_SANITY_TEST=m +CONFIG_PERCPU_TEST=m +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_TEST_BPF=m +CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config new file mode 100644 index 0000000000..84c2b551e9 --- /dev/null +++ b/arch/s390/configs/kasan.config @@ -0,0 +1,4 @@ +# Help: Enable KASan for debugging +CONFIG_KASAN=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig new file mode 100644 index 0000000000..b831083b4e --- /dev/null +++ b/arch/s390/configs/zfcpdump_defconfig @@ -0,0 +1,82 @@ +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_SYSCALL=y +# CONFIG_CPU_ISOLATION is not set +# CONFIG_UTS_NS is not set +# CONFIG_TIME_NS is not set +# CONFIG_PID_NS is not set +# CONFIG_NET_NS is not set +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_CRASH_DUMP=y +CONFIG_MARCH_ZEC12=y +CONFIG_TUNE_ZEC12=y +# CONFIG_COMPAT is not set +CONFIG_NR_CPUS=2 +CONFIG_HZ_100=y +# CONFIG_CHSC_SCH is not set +# CONFIG_SCM_BUS is not set +# CONFIG_PFAULT is not set +# CONFIG_S390_HYPFS is not set +# CONFIG_VIRTUALIZATION is not set +# CONFIG_S390_GUEST is not set +# CONFIG_SECCOMP is not set +# CONFIG_GCC_PLUGINS is not set +# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_SWAP is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_COMPACTION is not set +# CONFIG_MIGRATION is not set +CONFIG_NET=y +# CONFIG_IUCV is not set +# CONFIG_PCPU_DEV_REFCNT is not set +# CONFIG_ETHTOOL_NETLINK is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_SAFE=y +CONFIG_BLK_DEV_RAM=y +# CONFIG_DCSSBLK is not set +# CONFIG_DASD is not set +CONFIG_ENCLOSURE_SERVICES=y +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y +CONFIG_SCSI_ENCLOSURE=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_ZFCP=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO is not set +# CONFIG_LEGACY_TIOCSTI is not set +# CONFIG_HVC_IUCV is not set +# CONFIG_HW_RANDOM_S390 is not set +# CONFIG_HMC_DRV is not set +# CONFIG_S390_TAPE is not set +# CONFIG_VMCP is not set +# CONFIG_MONWRITER is not set +# CONFIG_S390_VMUR is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_VIRTIO_MENU is not set +# CONFIG_VHOST_MENU is not set +# CONFIG_IOMMU_SUPPORT is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +# CONFIG_MISC_FILESYSTEMS is not set +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_LSM="yama,loadpin,safesetid,integrity" +CONFIG_INIT_STACK_NONE=y +# CONFIG_ZLIB_DFLTCC is not set +CONFIG_XZ_DEC_MICROLZMA=y +CONFIG_PRINTK_TIME=y +# CONFIG_SYMBOLIC_ERRNAME is not set +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_FS=y +CONFIG_PANIC_ON_OOPS=y +# CONFIG_SCHED_DEBUG is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_FTRACE is not set +# CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig new file mode 100644 index 0000000000..06ee706b0d --- /dev/null +++ b/arch/s390/crypto/Kconfig @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (s390)" + +config CRYPTO_CRC32_S390 + tristate "CRC32c and CRC32" + depends on S390 + select CRYPTO_HASH + select CRC32 + help + CRC32c and CRC32 CRC algorithms + + Architecture: s390 + + It is available with IBM z13 or later. + +config CRYPTO_SHA512_S390 + tristate "Hash functions: SHA-384 and SHA-512" + depends on S390 + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z10. + +config CRYPTO_SHA1_S390 + tristate "Hash functions: SHA-1" + depends on S390 + select CRYPTO_HASH + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: s390 + + It is available as of z990. + +config CRYPTO_SHA256_S390 + tristate "Hash functions: SHA-224 and SHA-256" + depends on S390 + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: s390 + + It is available as of z9. + +config CRYPTO_SHA3_256_S390 + tristate "Hash functions: SHA3-224 and SHA3-256" + depends on S390 + select CRYPTO_HASH + help + SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_SHA3_512_S390 + tristate "Hash functions: SHA3-384 and SHA3-512" + depends on S390 + select CRYPTO_HASH + help + SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) + + Architecture: s390 + + It is available as of z14. + +config CRYPTO_GHASH_S390 + tristate "Hash functions: GHASH" + depends on S390 + select CRYPTO_HASH + help + GCM GHASH hash function (NIST SP800-38D) + + Architecture: s390 + + It is available as of z196. + +config CRYPTO_AES_S390 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + help + Block cipher: AES cipher algorithms (FIPS 197) + AEAD cipher: AES with GCM + Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes + + Architecture: s390 + + As of z9 the ECB and CBC modes are hardware accelerated + for 128 bit keys. + + As of z10 the ECB and CBC modes are hardware accelerated + for all AES key sizes. + + As of z196 the CTR mode is hardware accelerated for all AES + key sizes and XTS mode is hardware accelerated for 256 and + 512 bit keys. + +config CRYPTO_DES_S390 + tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR" + depends on S390 + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_LIB_DES + help + Block ciphers: DES (FIPS 46-2) cipher algorithm + Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm + Length-preserving ciphers: DES with ECB, CBC, and CTR modes + Length-preserving ciphers: Triple DES EDED with ECB, CBC, and CTR modes + + Architecture: s390 + + As of z990 the ECB and CBC mode are hardware accelerated. + As of z196 the CTR mode is hardware accelerated. + +config CRYPTO_CHACHA_S390 + tristate "Ciphers: ChaCha20" + depends on S390 + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving cipher: ChaCha20 stream cipher (RFC 7539) + + Architecture: s390 + + It is available as of z13. + +endmenu diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile new file mode 100644 index 0000000000..1b1cc478fa --- /dev/null +++ b/arch/s390/crypto/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Cryptographic API +# + +obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o +obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o +obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o +obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o +obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o +obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o +obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o +obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o +obj-$(CONFIG_S390_PRNG) += prng.o +obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o +obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o +obj-y += arch_random.o + +crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o +chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c new file mode 100644 index 0000000000..c6fe5405de --- /dev/null +++ b/arch/s390/crypto/aes_s390.c @@ -0,0 +1,1057 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the AES Cipher Algorithm. + * + * s390 Version: + * Copyright IBM Corp. 2005, 2017 + * Author(s): Jan Glauber (jang@de.ibm.com) + * Sebastian Siewior (sebastian@breakpoint.cc> SW-Fallback + * Patrick Steuer + * Harald Freudenberger + * + * Derived from "crypto/aes_generic.c" + */ + +#define KMSG_COMPONENT "aes_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u8 *ctrblk; +static DEFINE_MUTEX(ctrblk_lock); + +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions, + kma_functions; + +struct s390_aes_ctx { + u8 key[AES_MAX_KEY_SIZE]; + int key_len; + unsigned long fc; + union { + struct crypto_skcipher *skcipher; + struct crypto_cipher *cip; + } fallback; +}; + +struct s390_xts_ctx { + u8 key[32]; + u8 pcc_key[32]; + int key_len; + unsigned long fc; + struct crypto_skcipher *fallback; +}; + +struct gcm_sg_walk { + struct scatter_walk walk; + unsigned int walk_bytes; + u8 *walk_ptr; + unsigned int walk_bytes_remain; + u8 buf[AES_BLOCK_SIZE]; + unsigned int buf_bytes; + u8 *ptr; + unsigned int nbytes; +}; + +static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); +} + +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + unsigned long fc; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KM_AES_128 : + (key_len == 24) ? CPACF_KM_AES_192 : + (key_len == 32) ? CPACF_KM_AES_256 : 0; + + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_cip(tfm, in_key, key_len); + + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; +} + +static void crypto_aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + if (unlikely(!sctx->fc)) { + crypto_cipher_encrypt_one(sctx->fallback.cip, out, in); + return; + } + cpacf_km(sctx->fc, &sctx->key, out, in, AES_BLOCK_SIZE); +} + +static void crypto_aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + if (unlikely(!sctx->fc)) { + crypto_cipher_decrypt_one(sctx->fallback.cip, out, in); + return; + } + cpacf_km(sctx->fc | CPACF_DECRYPT, + &sctx->key, out, in, AES_BLOCK_SIZE); +} + +static int fallback_init_cip(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + sctx->fallback.cip = crypto_alloc_cipher(name, 0, + CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(sctx->fallback.cip)) { + pr_err("Allocating AES fallback algorithm %s failed\n", + name); + return PTR_ERR(sctx->fallback.cip); + } + + return 0; +} + +static void fallback_exit_cip(struct crypto_tfm *tfm) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + crypto_free_cipher(sctx->fallback.cip); + sctx->fallback.cip = NULL; +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-s390", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_aes_ctx), + .cra_module = THIS_MODULE, + .cra_init = fallback_init_cip, + .cra_exit = fallback_exit_cip, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = crypto_aes_encrypt, + .cia_decrypt = crypto_aes_decrypt, + } + } +}; + +static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) +{ + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + + crypto_skcipher_clear_flags(sctx->fallback.skcipher, + CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(sctx->fallback.skcipher, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + return crypto_skcipher_setkey(sctx->fallback.skcipher, key, len); +} + +static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx, + struct skcipher_request *req, + unsigned long modifier) +{ + struct skcipher_request *subreq = skcipher_request_ctx(req); + + *subreq = *req; + skcipher_request_set_tfm(subreq, sctx->fallback.skcipher); + return (modifier & CPACF_DECRYPT) ? + crypto_skcipher_decrypt(subreq) : + crypto_skcipher_encrypt(subreq); +} + +static int ecb_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + unsigned long fc; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KM_AES_128 : + (key_len == 24) ? CPACF_KM_AES_192 : + (key_len == 32) ? CPACF_KM_AES_256 : 0; + + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_skcipher(tfm, in_key, key_len); + + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; +} + +static int ecb_aes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n; + int ret; + + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, modifier); + + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_km(sctx->fc | modifier, sctx->key, + walk.dst.virt.addr, walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); + } + return ret; +} + +static int ecb_aes_encrypt(struct skcipher_request *req) +{ + return ecb_aes_crypt(req, 0); +} + +static int ecb_aes_decrypt(struct skcipher_request *req) +{ + return ecb_aes_crypt(req, CPACF_DECRYPT); +} + +static int fallback_init_skcipher(struct crypto_skcipher *tfm) +{ + const char *name = crypto_tfm_alg_name(&tfm->base); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + + sctx->fallback.skcipher = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC); + + if (IS_ERR(sctx->fallback.skcipher)) { + pr_err("Allocating AES fallback algorithm %s failed\n", + name); + return PTR_ERR(sctx->fallback.skcipher); + } + + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(sctx->fallback.skcipher)); + return 0; +} + +static void fallback_exit_skcipher(struct crypto_skcipher *tfm) +{ + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(sctx->fallback.skcipher); +} + +static struct skcipher_alg ecb_aes_alg = { + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = ecb_aes_set_key, + .encrypt = ecb_aes_encrypt, + .decrypt = ecb_aes_decrypt, +}; + +static int cbc_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + unsigned long fc; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KMC_AES_128 : + (key_len == 24) ? CPACF_KMC_AES_192 : + (key_len == 32) ? CPACF_KMC_AES_256 : 0; + + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_skcipher(tfm, in_key, key_len); + + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; +} + +static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n; + int ret; + struct { + u8 iv[AES_BLOCK_SIZE]; + u8 key[AES_MAX_KEY_SIZE]; + } param; + + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, modifier); + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); + memcpy(param.key, sctx->key, sctx->key_len); + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_kmc(sctx->fc | modifier, ¶m, + walk.dst.virt.addr, walk.src.virt.addr, n); + memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); + } + memzero_explicit(¶m, sizeof(param)); + return ret; +} + +static int cbc_aes_encrypt(struct skcipher_request *req) +{ + return cbc_aes_crypt(req, 0); +} + +static int cbc_aes_decrypt(struct skcipher_request *req) +{ + return cbc_aes_crypt(req, CPACF_DECRYPT); +} + +static struct skcipher_alg cbc_aes_alg = { + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_aes_set_key, + .encrypt = cbc_aes_encrypt, + .decrypt = cbc_aes_decrypt, +}; + +static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int len) +{ + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + + crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(xts_ctx->fallback, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + return crypto_skcipher_setkey(xts_ctx->fallback, key, len); +} + +static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + unsigned long fc; + int err; + + err = xts_fallback_setkey(tfm, in_key, key_len); + if (err) + return err; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 32) ? CPACF_KM_XTS_128 : + (key_len == 64) ? CPACF_KM_XTS_256 : 0; + + /* Check if the function code is available */ + xts_ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!xts_ctx->fc) + return 0; + + /* Split the XTS key into the two subkeys */ + key_len = key_len / 2; + xts_ctx->key_len = key_len; + memcpy(xts_ctx->key, in_key, key_len); + memcpy(xts_ctx->pcc_key, in_key + key_len, key_len); + return 0; +} + +static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int offset, nbytes, n; + int ret; + struct { + u8 key[32]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; + } pcc_param; + struct { + u8 key[32]; + u8 init[16]; + } xts_param; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + if (unlikely(!xts_ctx->fc || (req->cryptlen % AES_BLOCK_SIZE) != 0)) { + struct skcipher_request *subreq = skcipher_request_ctx(req); + + *subreq = *req; + skcipher_request_set_tfm(subreq, xts_ctx->fallback); + return (modifier & CPACF_DECRYPT) ? + crypto_skcipher_decrypt(subreq) : + crypto_skcipher_encrypt(subreq); + } + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + offset = xts_ctx->key_len & 0x10; + memset(pcc_param.block, 0, sizeof(pcc_param.block)); + memset(pcc_param.bit, 0, sizeof(pcc_param.bit)); + memset(pcc_param.xts, 0, sizeof(pcc_param.xts)); + memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); + memcpy(pcc_param.key + offset, xts_ctx->pcc_key, xts_ctx->key_len); + cpacf_pcc(xts_ctx->fc, pcc_param.key + offset); + + memcpy(xts_param.key + offset, xts_ctx->key, xts_ctx->key_len); + memcpy(xts_param.init, pcc_param.xts, 16); + + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_km(xts_ctx->fc | modifier, xts_param.key + offset, + walk.dst.virt.addr, walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); + } + memzero_explicit(&pcc_param, sizeof(pcc_param)); + memzero_explicit(&xts_param, sizeof(xts_param)); + return ret; +} + +static int xts_aes_encrypt(struct skcipher_request *req) +{ + return xts_aes_crypt(req, 0); +} + +static int xts_aes_decrypt(struct skcipher_request *req) +{ + return xts_aes_crypt(req, CPACF_DECRYPT); +} + +static int xts_fallback_init(struct crypto_skcipher *tfm) +{ + const char *name = crypto_tfm_alg_name(&tfm->base); + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + + xts_ctx->fallback = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC); + + if (IS_ERR(xts_ctx->fallback)) { + pr_err("Allocating XTS fallback algorithm %s failed\n", + name); + return PTR_ERR(xts_ctx->fallback); + } + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(xts_ctx->fallback)); + return 0; +} + +static void xts_fallback_exit(struct crypto_skcipher *tfm) +{ + struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(xts_ctx->fallback); +} + +static struct skcipher_alg xts_aes_alg = { + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_xts_ctx), + .base.cra_module = THIS_MODULE, + .init = xts_fallback_init, + .exit = xts_fallback_exit, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aes_set_key, + .encrypt = xts_aes_encrypt, + .decrypt = xts_aes_decrypt, +}; + +static int ctr_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + unsigned long fc; + + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KMCTR_AES_128 : + (key_len == 24) ? CPACF_KMCTR_AES_192 : + (key_len == 32) ? CPACF_KMCTR_AES_256 : 0; + + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_skcipher(tfm, in_key, key_len); + + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; +} + +static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +{ + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + memcpy(ctrptr, iv, AES_BLOCK_SIZE); + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); + crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ctrptr += AES_BLOCK_SIZE; + } + return n; +} + +static int ctr_aes_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm); + u8 buf[AES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; + unsigned int n, nbytes; + int ret, locked; + + if (unlikely(!sctx->fc)) + return fallback_skcipher_crypt(sctx, req, 0); + + locked = mutex_trylock(&ctrblk_lock); + + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + n = AES_BLOCK_SIZE; + + if (nbytes >= 2*AES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; + cpacf_kmctr(sctx->fc, sctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); + if (ctrptr == ctrblk) + memcpy(walk.iv, ctrptr + n - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); + } + if (locked) + mutex_unlock(&ctrblk_lock); + /* + * final block may be < AES_BLOCK_SIZE, copy only nbytes + */ + if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); + cpacf_kmctr(sctx->fc, sctx->key, buf, buf, + AES_BLOCK_SIZE, walk.iv); + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, 0); + } + + return ret; +} + +static struct skcipher_alg ctr_aes_alg = { + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-s390", + .base.cra_priority = 402, /* ecb-aes-s390 + 1 */ + .base.cra_flags = CRYPTO_ALG_NEED_FALLBACK, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_aes_ctx), + .base.cra_module = THIS_MODULE, + .init = fallback_init_skcipher, + .exit = fallback_exit_skcipher, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_aes_set_key, + .encrypt = ctr_aes_crypt, + .decrypt = ctr_aes_crypt, + .chunksize = AES_BLOCK_SIZE, +}; + +static int gcm_aes_setkey(struct crypto_aead *tfm, const u8 *key, + unsigned int keylen) +{ + struct s390_aes_ctx *ctx = crypto_aead_ctx(tfm); + + switch (keylen) { + case AES_KEYSIZE_128: + ctx->fc = CPACF_KMA_GCM_AES_128; + break; + case AES_KEYSIZE_192: + ctx->fc = CPACF_KMA_GCM_AES_192; + break; + case AES_KEYSIZE_256: + ctx->fc = CPACF_KMA_GCM_AES_256; + break; + default: + return -EINVAL; + } + + memcpy(ctx->key, key, keylen); + ctx->key_len = keylen; + return 0; +} + +static int gcm_aes_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static void gcm_walk_start(struct gcm_sg_walk *gw, struct scatterlist *sg, + unsigned int len) +{ + memset(gw, 0, sizeof(*gw)); + gw->walk_bytes_remain = len; + scatterwalk_start(&gw->walk, sg); +} + +static inline unsigned int _gcm_sg_clamp_and_map(struct gcm_sg_walk *gw) +{ + struct scatterlist *nextsg; + + gw->walk_bytes = scatterwalk_clamp(&gw->walk, gw->walk_bytes_remain); + while (!gw->walk_bytes) { + nextsg = sg_next(gw->walk.sg); + if (!nextsg) + return 0; + scatterwalk_start(&gw->walk, nextsg); + gw->walk_bytes = scatterwalk_clamp(&gw->walk, + gw->walk_bytes_remain); + } + gw->walk_ptr = scatterwalk_map(&gw->walk); + return gw->walk_bytes; +} + +static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw, + unsigned int nbytes) +{ + gw->walk_bytes_remain -= nbytes; + scatterwalk_unmap(gw->walk_ptr); + scatterwalk_advance(&gw->walk, nbytes); + scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain); + gw->walk_ptr = NULL; +} + +static int gcm_in_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) +{ + int n; + + if (gw->buf_bytes && gw->buf_bytes >= minbytesneeded) { + gw->ptr = gw->buf; + gw->nbytes = gw->buf_bytes; + goto out; + } + + if (gw->walk_bytes_remain == 0) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + + if (!_gcm_sg_clamp_and_map(gw)) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + + if (!gw->buf_bytes && gw->walk_bytes >= minbytesneeded) { + gw->ptr = gw->walk_ptr; + gw->nbytes = gw->walk_bytes; + goto out; + } + + while (1) { + n = min(gw->walk_bytes, AES_BLOCK_SIZE - gw->buf_bytes); + memcpy(gw->buf + gw->buf_bytes, gw->walk_ptr, n); + gw->buf_bytes += n; + _gcm_sg_unmap_and_advance(gw, n); + if (gw->buf_bytes >= minbytesneeded) { + gw->ptr = gw->buf; + gw->nbytes = gw->buf_bytes; + goto out; + } + if (!_gcm_sg_clamp_and_map(gw)) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + } + +out: + return gw->nbytes; +} + +static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded) +{ + if (gw->walk_bytes_remain == 0) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + + if (!_gcm_sg_clamp_and_map(gw)) { + gw->ptr = NULL; + gw->nbytes = 0; + goto out; + } + + if (gw->walk_bytes >= minbytesneeded) { + gw->ptr = gw->walk_ptr; + gw->nbytes = gw->walk_bytes; + goto out; + } + + scatterwalk_unmap(gw->walk_ptr); + gw->walk_ptr = NULL; + + gw->ptr = gw->buf; + gw->nbytes = sizeof(gw->buf); + +out: + return gw->nbytes; +} + +static int gcm_in_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) +{ + if (gw->ptr == NULL) + return 0; + + if (gw->ptr == gw->buf) { + int n = gw->buf_bytes - bytesdone; + if (n > 0) { + memmove(gw->buf, gw->buf + bytesdone, n); + gw->buf_bytes = n; + } else + gw->buf_bytes = 0; + } else + _gcm_sg_unmap_and_advance(gw, bytesdone); + + return bytesdone; +} + +static int gcm_out_walk_done(struct gcm_sg_walk *gw, unsigned int bytesdone) +{ + int i, n; + + if (gw->ptr == NULL) + return 0; + + if (gw->ptr == gw->buf) { + for (i = 0; i < bytesdone; i += n) { + if (!_gcm_sg_clamp_and_map(gw)) + return i; + n = min(gw->walk_bytes, bytesdone - i); + memcpy(gw->walk_ptr, gw->buf + i, n); + _gcm_sg_unmap_and_advance(gw, n); + } + } else + _gcm_sg_unmap_and_advance(gw, bytesdone); + + return bytesdone; +} + +static int gcm_aes_crypt(struct aead_request *req, unsigned int flags) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct s390_aes_ctx *ctx = crypto_aead_ctx(tfm); + unsigned int ivsize = crypto_aead_ivsize(tfm); + unsigned int taglen = crypto_aead_authsize(tfm); + unsigned int aadlen = req->assoclen; + unsigned int pclen = req->cryptlen; + int ret = 0; + + unsigned int n, len, in_bytes, out_bytes, + min_bytes, bytes, aad_bytes, pc_bytes; + struct gcm_sg_walk gw_in, gw_out; + u8 tag[GHASH_DIGEST_SIZE]; + + struct { + u32 _[3]; /* reserved */ + u32 cv; /* Counter Value */ + u8 t[GHASH_DIGEST_SIZE];/* Tag */ + u8 h[AES_BLOCK_SIZE]; /* Hash-subkey */ + u64 taadl; /* Total AAD Length */ + u64 tpcl; /* Total Plain-/Cipher-text Length */ + u8 j0[GHASH_BLOCK_SIZE];/* initial counter value */ + u8 k[AES_MAX_KEY_SIZE]; /* Key */ + } param; + + /* + * encrypt + * req->src: aad||plaintext + * req->dst: aad||ciphertext||tag + * decrypt + * req->src: aad||ciphertext||tag + * req->dst: aad||plaintext, return 0 or -EBADMSG + * aad, plaintext and ciphertext may be empty. + */ + if (flags & CPACF_DECRYPT) + pclen -= taglen; + len = aadlen + pclen; + + memset(¶m, 0, sizeof(param)); + param.cv = 1; + param.taadl = aadlen * 8; + param.tpcl = pclen * 8; + memcpy(param.j0, req->iv, ivsize); + *(u32 *)(param.j0 + ivsize) = 1; + memcpy(param.k, ctx->key, ctx->key_len); + + gcm_walk_start(&gw_in, req->src, len); + gcm_walk_start(&gw_out, req->dst, len); + + do { + min_bytes = min_t(unsigned int, + aadlen > 0 ? aadlen : pclen, AES_BLOCK_SIZE); + in_bytes = gcm_in_walk_go(&gw_in, min_bytes); + out_bytes = gcm_out_walk_go(&gw_out, min_bytes); + bytes = min(in_bytes, out_bytes); + + if (aadlen + pclen <= bytes) { + aad_bytes = aadlen; + pc_bytes = pclen; + flags |= CPACF_KMA_LAAD | CPACF_KMA_LPC; + } else { + if (aadlen <= bytes) { + aad_bytes = aadlen; + pc_bytes = (bytes - aadlen) & + ~(AES_BLOCK_SIZE - 1); + flags |= CPACF_KMA_LAAD; + } else { + aad_bytes = bytes & ~(AES_BLOCK_SIZE - 1); + pc_bytes = 0; + } + } + + if (aad_bytes > 0) + memcpy(gw_out.ptr, gw_in.ptr, aad_bytes); + + cpacf_kma(ctx->fc | flags, ¶m, + gw_out.ptr + aad_bytes, + gw_in.ptr + aad_bytes, pc_bytes, + gw_in.ptr, aad_bytes); + + n = aad_bytes + pc_bytes; + if (gcm_in_walk_done(&gw_in, n) != n) + return -ENOMEM; + if (gcm_out_walk_done(&gw_out, n) != n) + return -ENOMEM; + aadlen -= aad_bytes; + pclen -= pc_bytes; + } while (aadlen + pclen > 0); + + if (flags & CPACF_DECRYPT) { + scatterwalk_map_and_copy(tag, req->src, len, taglen, 0); + if (crypto_memneq(tag, param.t, taglen)) + ret = -EBADMSG; + } else + scatterwalk_map_and_copy(param.t, req->dst, len, taglen, 1); + + memzero_explicit(¶m, sizeof(param)); + return ret; +} + +static int gcm_aes_encrypt(struct aead_request *req) +{ + return gcm_aes_crypt(req, CPACF_ENCRYPT); +} + +static int gcm_aes_decrypt(struct aead_request *req) +{ + return gcm_aes_crypt(req, CPACF_DECRYPT); +} + +static struct aead_alg gcm_aes_aead = { + .setkey = gcm_aes_setkey, + .setauthsize = gcm_aes_setauthsize, + .encrypt = gcm_aes_encrypt, + .decrypt = gcm_aes_decrypt, + + .ivsize = GHASH_BLOCK_SIZE - sizeof(u32), + .maxauthsize = GHASH_DIGEST_SIZE, + .chunksize = AES_BLOCK_SIZE, + + .base = { + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_aes_ctx), + .cra_priority = 900, + .cra_name = "gcm(aes)", + .cra_driver_name = "gcm-aes-s390", + .cra_module = THIS_MODULE, + }, +}; + +static struct crypto_alg *aes_s390_alg; +static struct skcipher_alg *aes_s390_skcipher_algs[4]; +static int aes_s390_skciphers_num; +static struct aead_alg *aes_s390_aead_alg; + +static int aes_s390_register_skcipher(struct skcipher_alg *alg) +{ + int ret; + + ret = crypto_register_skcipher(alg); + if (!ret) + aes_s390_skcipher_algs[aes_s390_skciphers_num++] = alg; + return ret; +} + +static void aes_s390_fini(void) +{ + if (aes_s390_alg) + crypto_unregister_alg(aes_s390_alg); + while (aes_s390_skciphers_num--) + crypto_unregister_skcipher(aes_s390_skcipher_algs[aes_s390_skciphers_num]); + if (ctrblk) + free_page((unsigned long) ctrblk); + + if (aes_s390_aead_alg) + crypto_unregister_aead(aes_s390_aead_alg); +} + +static int __init aes_s390_init(void) +{ + int ret; + + /* Query available functions for KM, KMC, KMCTR and KMA */ + cpacf_query(CPACF_KM, &km_functions); + cpacf_query(CPACF_KMC, &kmc_functions); + cpacf_query(CPACF_KMCTR, &kmctr_functions); + cpacf_query(CPACF_KMA, &kma_functions); + + if (cpacf_test_func(&km_functions, CPACF_KM_AES_128) || + cpacf_test_func(&km_functions, CPACF_KM_AES_192) || + cpacf_test_func(&km_functions, CPACF_KM_AES_256)) { + ret = crypto_register_alg(&aes_alg); + if (ret) + goto out_err; + aes_s390_alg = &aes_alg; + ret = aes_s390_register_skcipher(&ecb_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmc_functions, CPACF_KMC_AES_128) || + cpacf_test_func(&kmc_functions, CPACF_KMC_AES_192) || + cpacf_test_func(&kmc_functions, CPACF_KMC_AES_256)) { + ret = aes_s390_register_skcipher(&cbc_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&km_functions, CPACF_KM_XTS_128) || + cpacf_test_func(&km_functions, CPACF_KM_XTS_256)) { + ret = aes_s390_register_skcipher(&xts_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_128) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_192) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_256)) { + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto out_err; + } + ret = aes_s390_register_skcipher(&ctr_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_128) || + cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_192) || + cpacf_test_func(&kma_functions, CPACF_KMA_GCM_AES_256)) { + ret = crypto_register_aead(&gcm_aes_aead); + if (ret) + goto out_err; + aes_s390_aead_alg = &gcm_aes_aead; + } + + return 0; +out_err: + aes_s390_fini(); + return ret; +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init); +module_exit(aes_s390_fini); + +MODULE_ALIAS_CRYPTO("aes-all"); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(CRYPTO_INTERNAL); diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c new file mode 100644 index 0000000000..a8a2407381 --- /dev/null +++ b/arch/s390/crypto/arch_random.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 arch random implementation. + * + * Copyright IBM Corp. 2017, 2020 + * Author(s): Harald Freudenberger + */ + +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); + +atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); +EXPORT_SYMBOL(s390_arch_random_counter); diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c new file mode 100644 index 0000000000..5fae187f94 --- /dev/null +++ b/arch/s390/crypto/chacha-glue.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "chacha-s390.h" + +static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src, + unsigned int nbytes, const u32 *key, + u32 *counter) +{ + struct kernel_fpu vxstate; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, nbytes, key, counter); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static int chacha20_s390(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 state[CHACHA_STATE_WORDS] __aligned(16); + struct skcipher_walk walk; + unsigned int nbytes; + int rc; + + rc = skcipher_walk_virt(&walk, req, false); + chacha_init_generic(state, ctx->key, req->iv); + + while (walk.nbytes > 0) { + nbytes = walk.nbytes; + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + if (nbytes <= CHACHA_BLOCK_SIZE) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { + chacha20_crypt_s390(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + &state[4], &state[12]); + } + rc = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return rc; +} + +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, stream, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !MACHINE_HAS_VX) + chacha_crypt_generic(state, dst, src, bytes, nrounds); + else + chacha20_crypt_s390(state, dst, src, bytes, + &state[4], &state[12]); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static struct skcipher_alg chacha_algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-s390", + .base.cra_priority = 900, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = chacha20_setkey, + .encrypt = chacha20_s390, + .decrypt = chacha20_s390, + } +}; + +static int __init chacha_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? + crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0; +} + +static void __exit chacha_mod_fini(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) + crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)); +} + +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init); +module_exit(chacha_mod_fini); + +MODULE_DESCRIPTION("ChaCha20 stream cipher"); +MODULE_LICENSE("GPL v2"); + +MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S new file mode 100644 index 0000000000..37cb63f25b --- /dev/null +++ b/arch/s390/crypto/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by . All Rights Reserved. + */ + +#include +#include +#include + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 +#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 +#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h new file mode 100644 index 0000000000..733744ce30 --- /dev/null +++ b/arch/s390/crypto/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c new file mode 100644 index 0000000000..017143e9ce --- /dev/null +++ b/arch/s390/crypto/crc32-vx.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Crypto-API module for CRC-32 algorithms implemented with the + * z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ +#define KMSG_COMPONENT "crc32-vx" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + + +#define CRC32_BLOCK_SIZE 1 +#define CRC32_DIGEST_SIZE 4 + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +struct crc_ctx { + u32 key; +}; + +struct crc_desc_ctx { + u32 crc; +}; + +/* Prototypes for functions in assembly files */ +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. + * + */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + static u32 __pure ___fname(u32 crc, \ + unsigned char const *data, size_t datalen) \ + { \ + struct kernel_fpu vxstate; \ + unsigned long prealign, aligned, remaining; \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } + +DEFINE_CRC32_VX(crc32_le_vx, crc32_le_vgfm_16, crc32_le) +DEFINE_CRC32_VX(crc32_be_vx, crc32_be_vgfm_16, crc32_be) +DEFINE_CRC32_VX(crc32c_le_vx, crc32c_le_vgfm_16, __crc32c_le) + + +static int crc32_vx_cra_init_zero(struct crypto_tfm *tfm) +{ + struct crc_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = 0; + return 0; +} + +static int crc32_vx_cra_init_invert(struct crypto_tfm *tfm) +{ + struct crc_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = ~0; + return 0; +} + +static int crc32_vx_init(struct shash_desc *desc) +{ + struct crc_ctx *mctx = crypto_shash_ctx(desc->tfm); + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = mctx->key; + return 0; +} + +static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, + unsigned int newkeylen) +{ + struct crc_ctx *mctx = crypto_shash_ctx(tfm); + + if (newkeylen != sizeof(mctx->key)) + return -EINVAL; + mctx->key = le32_to_cpu(*(__le32 *)newkey); + return 0; +} + +static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, + unsigned int newkeylen) +{ + struct crc_ctx *mctx = crypto_shash_ctx(tfm); + + if (newkeylen != sizeof(mctx->key)) + return -EINVAL; + mctx->key = be32_to_cpu(*(__be32 *)newkey); + return 0; +} + +static int crc32le_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(&ctx->crc); + return 0; +} + +static int crc32be_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__be32 *)out = cpu_to_be32p(&ctx->crc); + return 0; +} + +static int crc32c_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + /* + * Perform a final XOR with 0xFFFFFFFF to be in sync + * with the generic crc32c shash implementation. + */ + *(__le32 *)out = ~cpu_to_le32p(&ctx->crc); + return 0; +} + +static int __crc32le_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_le_vx(*crc, data, len)); + return 0; +} + +static int __crc32be_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + *(__be32 *)out = cpu_to_be32(crc32_be_vx(*crc, data, len)); + return 0; +} + +static int __crc32c_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + /* + * Perform a final XOR with 0xFFFFFFFF to be in sync + * with the generic crc32c shash implementation. + */ + *(__le32 *)out = ~cpu_to_le32(crc32c_le_vx(*crc, data, len)); + return 0; +} + + +#define CRC32_VX_FINUP(alg, func) \ + static int alg ## _vx_finup(struct shash_desc *desc, const u8 *data, \ + unsigned int datalen, u8 *out) \ + { \ + return __ ## alg ## _vx_finup(shash_desc_ctx(desc), \ + data, datalen, out); \ + } + +CRC32_VX_FINUP(crc32le, crc32_le_vx) +CRC32_VX_FINUP(crc32be, crc32_be_vx) +CRC32_VX_FINUP(crc32c, crc32c_le_vx) + +#define CRC32_VX_DIGEST(alg, func) \ + static int alg ## _vx_digest(struct shash_desc *desc, const u8 *data, \ + unsigned int len, u8 *out) \ + { \ + return __ ## alg ## _vx_finup(crypto_shash_ctx(desc->tfm), \ + data, len, out); \ + } + +CRC32_VX_DIGEST(crc32le, crc32_le_vx) +CRC32_VX_DIGEST(crc32be, crc32_be_vx) +CRC32_VX_DIGEST(crc32c, crc32c_le_vx) + +#define CRC32_VX_UPDATE(alg, func) \ + static int alg ## _vx_update(struct shash_desc *desc, const u8 *data, \ + unsigned int datalen) \ + { \ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); \ + ctx->crc = func(ctx->crc, data, datalen); \ + return 0; \ + } + +CRC32_VX_UPDATE(crc32le, crc32_le_vx) +CRC32_VX_UPDATE(crc32be, crc32_be_vx) +CRC32_VX_UPDATE(crc32c, crc32c_le_vx) + + +static struct shash_alg crc32_vx_algs[] = { + /* CRC-32 LE */ + { + .init = crc32_vx_init, + .setkey = crc32_vx_setkey, + .update = crc32le_vx_update, + .final = crc32le_vx_final, + .finup = crc32le_vx_finup, + .digest = crc32le_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-vx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_zero, + }, + }, + /* CRC-32 BE */ + { + .init = crc32_vx_init, + .setkey = crc32be_vx_setkey, + .update = crc32be_vx_update, + .final = crc32be_vx_final, + .finup = crc32be_vx_finup, + .digest = crc32be_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32be", + .cra_driver_name = "crc32be-vx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_zero, + }, + }, + /* CRC-32C LE */ + { + .init = crc32_vx_init, + .setkey = crc32_vx_setkey, + .update = crc32c_vx_update, + .final = crc32c_vx_final, + .finup = crc32c_vx_finup, + .digest = crc32c_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-vx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_invert, + }, + }, +}; + + +static int __init crc_vx_mod_init(void) +{ + return crypto_register_shashes(crc32_vx_algs, + ARRAY_SIZE(crc32_vx_algs)); +} + +static void __exit crc_vx_mod_exit(void) +{ + crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs)); +} + +module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init); +module_exit(crc_vx_mod_exit); + +MODULE_AUTHOR("Hendrik Brueckner "); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-vx"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-vx"); diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S new file mode 100644 index 0000000000..34ee479268 --- /dev/null +++ b/arch/s390/crypto/crc32be-vx.S @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of CRC-32 checksums. + * + * This CRC-32 implementation algorithm processes the most-significant + * bit first (BE). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#include +#include +#include + +/* Vector register range containing CRC-32 constants */ +#define CONST_R1R2 %v9 +#define CONST_R3R4 %v10 +#define CONST_R5 %v11 +#define CONST_R6 %v12 +#define CONST_RU_POLY %v13 +#define CONST_CRC_POLY %v14 + + .data + .balign 8 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = x4*128+64 mod P(x) + * R2 = x4*128 mod P(x) + * R3 = x128+64 mod P(x) + * R4 = x128 mod P(x) + * R5 = x96 mod P(x) + * R6 = x64 mod P(x) + * + * Barret reduction constant, u, is defined as floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * Note that the constant definitions below are extended in order to compute + * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. + * The rightmost doubleword can be 0 to prevent contribution to the result or + * can be multiplied by 1 to perform an XOR without the need for a separate + * VECTOR EXCLUSIVE OR instruction. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + +SYM_DATA_START_LOCAL(constants_CRC_32_BE) + .quad 0x08833794c, 0x0e6228b11 # R1, R2 + .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4 + .quad 0x0f200aa66, 1 << 32 # R5, x32 + .quad 0x0490d678d, 1 # R6, 1 + .quad 0x104d101df, 0 # u + .quad 0x104C11DB7, 0 # P(x) +SYM_DATA_END(constants_CRC_32_BE) + + .previous + + GEN_BR_THUNK %r14 + + .text +/* + * The CRC-32 function(s) use these calling conventions: + * + * Parameters: + * + * %r2: Initial CRC value, typically ~0; and final CRC (return) value. + * %r3: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * %r4: Length of the buffer, must be 64 bytes or greater. + * + * Register usage: + * + * %r5: CRC-32 constant pool base pointer. + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * + * V9..V14: CRC-32 constants. + */ +SYM_FUNC_START(crc32_be_vgfm_16) + /* Load CRC-32 constants */ + larl %r5,constants_CRC_32_BE + VLM CONST_R1R2,CONST_CRC_POLY,0,%r5 + + /* Load the initial CRC value into the leftmost word of V0. */ + VZERO %v0 + VLVGF %v0,%r2,0 + + /* Load a 64-byte data chunk and XOR with CRC */ + VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ + VX %v1,%v0,%v1 /* V1 ^= CRC */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + /* Check remaining buffer size and jump to proper folding method */ + cghi %r4,64 + jl .Lless_than_64bytes + +.Lfold_64bytes_loop: + /* Load the next 64-byte data chunk into V5 to V8 */ + VLM %v5,%v8,0,%r3 + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the reduction constants in V0. The intermediate result is + * then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + VGFMAG %v1,CONST_R1R2,%v1,%v5 + VGFMAG %v2,CONST_R1R2,%v2,%v6 + VGFMAG %v3,CONST_R1R2,%v3,%v7 + VGFMAG %v4,CONST_R1R2,%v4,%v8 + + /* Adjust buffer pointer and length for next loop */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jnl .Lfold_64bytes_loop + +.Lless_than_64bytes: + /* Fold V1 to V4 into a single 128-bit value in V1 */ + VGFMAG %v1,CONST_R3R4,%v1,%v2 + VGFMAG %v1,CONST_R3R4,%v1,%v3 + VGFMAG %v1,CONST_R3R4,%v1,%v4 + + /* Check whether to continue with 64-bit folding */ + cghi %r4,16 + jl .Lfinal_fold + +.Lfold_16bytes_loop: + + VL %v2,0,,%r3 /* Load next data chunk */ + VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */ + + /* Adjust buffer pointer and size for folding next data chunk */ + aghi %r3,16 + aghi %r4,-16 + + /* Process remaining data chunks */ + cghi %r4,16 + jnl .Lfold_16bytes_loop + +.Lfinal_fold: + /* + * The R5 constant is used to fold a 128-bit value into an 96-bit value + * that is XORed with the next 96-bit input data chunk. To use a single + * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to + * form an intermediate 96-bit value (with appended zeros) which is then + * XORed with the intermediate reduction result. + */ + VGFMG %v1,CONST_R5,%v1 + + /* + * Further reduce the remaining 96-bit value to a 64-bit value using a + * single VGFMG, the rightmost doubleword is multiplied with 0x1. The + * intermediate result is then XORed with the product of the leftmost + * doubleword with R6. The result is a 64-bit value and is subject to + * the Barret reduction. + */ + VGFMG %v1,CONST_R6,%v1 + + /* + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: To compensate the division by x^32, use the vector unpack + * instruction to move the leftmost word into the leftmost doubleword + * of the vector register. The rightmost doubleword is multiplied + * with zero to not contribute to the intermediate results. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + VUPLLF %v2,%v1 + VGFMG %v2,CONST_RU_POLY,%v2 + + /* + * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is in the rightmost word of V2. + */ + VUPLLF %v2,%v2 + VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 + +.Ldone: + VLGVF %r2,%v2,3 + BR_EX %r14 +SYM_FUNC_END(crc32_be_vgfm_16) + +.previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S new file mode 100644 index 0000000000..5a819ae09a --- /dev/null +++ b/arch/s390/crypto/crc32le-vx.S @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet + * and Castagnoli. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#include +#include +#include + +/* Vector register range containing CRC-32 constants */ +#define CONST_PERM_LE2BE %v9 +#define CONST_R2R1 %v10 +#define CONST_R4R3 %v11 +#define CONST_R5 %v12 +#define CONST_RU_POLY %v13 +#define CONST_CRC_POLY %v14 + + .data + .balign 8 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + +SYM_DATA_START_LOCAL(constants_CRC_32_LE) + .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask + .quad 0x1c6e41596, 0x154442bd4 # R2, R1 + .quad 0x0ccaa009e, 0x1751997d0 # R4, R3 + .octa 0x163cd6124 # R5 + .octa 0x1F7011641 # u' + .octa 0x1DB710641 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32_LE) + +SYM_DATA_START_LOCAL(constants_CRC_32C_LE) + .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask + .quad 0x09e4addf8, 0x740eef02 # R2, R1 + .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3 + .octa 0x0dd45aab8 # R5 + .octa 0x0dea713f1 # u' + .octa 0x105ec76f0 # P'(x) << 1 +SYM_DATA_END(constants_CRC_32C_LE) + + .previous + + GEN_BR_THUNK %r14 + + .text + +/* + * The CRC-32 functions use these calling conventions: + * + * Parameters: + * + * %r2: Initial CRC value, typically ~0; and final CRC (return) value. + * %r3: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * %r4: Length of the buffer, must be 64 bytes or greater. + * + * Register usage: + * + * %r5: CRC-32 constant pool base pointer. + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9: Constant for BE->LE conversion and shift operations + * + * V10..V14: CRC-32 constants. + */ + +SYM_FUNC_START(crc32_le_vgfm_16) + larl %r5,constants_CRC_32_LE + j crc32_le_vgfm_generic +SYM_FUNC_END(crc32_le_vgfm_16) + +SYM_FUNC_START(crc32c_le_vgfm_16) + larl %r5,constants_CRC_32C_LE + j crc32_le_vgfm_generic +SYM_FUNC_END(crc32c_le_vgfm_16) + +SYM_FUNC_START(crc32_le_vgfm_generic) + /* Load CRC-32 constants */ + VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. + */ + VZERO %v0 /* Clear V0 */ + VLVGF %v0,%r2,3 /* Load CRC into rightmost word */ + + /* Load a 64-byte data chunk and XOR with CRC */ + VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ + VPERM %v1,%v1,%v1,CONST_PERM_LE2BE + VPERM %v2,%v2,%v2,CONST_PERM_LE2BE + VPERM %v3,%v3,%v3,CONST_PERM_LE2BE + VPERM %v4,%v4,%v4,CONST_PERM_LE2BE + + VX %v1,%v0,%v1 /* V1 ^= CRC */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jl .Lless_than_64bytes + +.Lfold_64bytes_loop: + /* Load the next 64-byte data chunk into V5 to V8 */ + VLM %v5,%v8,0,%r3 + VPERM %v5,%v5,%v5,CONST_PERM_LE2BE + VPERM %v6,%v6,%v6,CONST_PERM_LE2BE + VPERM %v7,%v7,%v7,CONST_PERM_LE2BE + VPERM %v8,%v8,%v8,CONST_PERM_LE2BE + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate result + * is then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + VGFMAG %v1,CONST_R2R1,%v1,%v5 + VGFMAG %v2,CONST_R2R1,%v2,%v6 + VGFMAG %v3,CONST_R2R1,%v3,%v7 + VGFMAG %v4,CONST_R2R1,%v4,%v8 + + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jnl .Lfold_64bytes_loop + +.Lless_than_64bytes: + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + VGFMAG %v1,CONST_R4R3,%v1,%v2 + VGFMAG %v1,CONST_R4R3,%v1,%v3 + VGFMAG %v1,CONST_R4R3,%v1,%v4 + + cghi %r4,16 + jl .Lfinal_fold + +.Lfold_16bytes_loop: + + VL %v2,0,,%r3 /* Load next data chunk */ + VPERM %v2,%v2,%v2,CONST_PERM_LE2BE + VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */ + + aghi %r3,16 + aghi %r4,-16 + + cghi %r4,16 + jnl .Lfold_16bytes_loop + +.Lfinal_fold: + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + VLEIB %v9,0x40,7 + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + VSRLB %v0,CONST_R4R3,%v9 + VLEIG %v0,1,0 + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + VGFMG %v1,%v0,%v1 + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. + */ + VLEIB %v9,0x20,7 /* Shift by words */ + VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */ + VUPLLF %v1,%v1 /* Split rightmost doubleword */ + VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */ + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + VUPLLF %v2,%v1 + VGFMG %v2,CONST_RU_POLY,%v2 + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. + */ + VUPLLF %v2,%v2 + VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 + +.Ldone: + VLGVF %r2,%v2,2 + BR_EX %r14 +SYM_FUNC_END(crc32_le_vgfm_generic) + +.previous diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c new file mode 100644 index 0000000000..8e75b83a5d --- /dev/null +++ b/arch/s390/crypto/des_s390.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the DES Cipher Algorithm. + * + * Copyright IBM Corp. 2003, 2011 + * Author(s): Thomas Spatzier + * Jan Glauber (jan.glauber@de.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DES3_KEY_SIZE (3 * DES_KEY_SIZE) + +static u8 *ctrblk; +static DEFINE_MUTEX(ctrblk_lock); + +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; + +struct s390_des_ctx { + u8 iv[DES_BLOCK_SIZE]; + u8 key[DES3_KEY_SIZE]; +}; + +static int des_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = crypto_des_verify_key(tfm, key); + if (err) + return err; + + memcpy(ctx->key, key, key_len); + return 0; +} + +static int des_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + return des_setkey(crypto_skcipher_tfm(tfm), key, key_len); +} + +static void s390_des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + + cpacf_km(CPACF_KM_DEA, ctx->key, out, in, DES_BLOCK_SIZE); +} + +static void s390_des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + + cpacf_km(CPACF_KM_DEA | CPACF_DECRYPT, + ctx->key, out, in, DES_BLOCK_SIZE); +} + +static struct crypto_alg des_alg = { + .cra_name = "des", + .cra_driver_name = "des-s390", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES_KEY_SIZE, + .cia_max_keysize = DES_KEY_SIZE, + .cia_setkey = des_setkey, + .cia_encrypt = s390_des_encrypt, + .cia_decrypt = s390_des_decrypt, + } + } +}; + +static int ecb_desall_crypt(struct skcipher_request *req, unsigned long fc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n; + int ret; + + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(DES_BLOCK_SIZE - 1); + cpacf_km(fc, ctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n); + ret = skcipher_walk_done(&walk, nbytes - n); + } + return ret; +} + +static int cbc_desall_crypt(struct skcipher_request *req, unsigned long fc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n; + int ret; + struct { + u8 iv[DES_BLOCK_SIZE]; + u8 key[DES3_KEY_SIZE]; + } param; + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + memcpy(param.iv, walk.iv, DES_BLOCK_SIZE); + memcpy(param.key, ctx->key, DES3_KEY_SIZE); + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(DES_BLOCK_SIZE - 1); + cpacf_kmc(fc, ¶m, walk.dst.virt.addr, + walk.src.virt.addr, n); + memcpy(walk.iv, param.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); + } + return ret; +} + +static int ecb_des_encrypt(struct skcipher_request *req) +{ + return ecb_desall_crypt(req, CPACF_KM_DEA); +} + +static int ecb_des_decrypt(struct skcipher_request *req) +{ + return ecb_desall_crypt(req, CPACF_KM_DEA | CPACF_DECRYPT); +} + +static struct skcipher_alg ecb_des_alg = { + .base.cra_name = "ecb(des)", + .base.cra_driver_name = "ecb-des-s390", + .base.cra_priority = 400, /* combo: des + ecb */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = ecb_des_encrypt, + .decrypt = ecb_des_decrypt, +}; + +static int cbc_des_encrypt(struct skcipher_request *req) +{ + return cbc_desall_crypt(req, CPACF_KMC_DEA); +} + +static int cbc_des_decrypt(struct skcipher_request *req) +{ + return cbc_desall_crypt(req, CPACF_KMC_DEA | CPACF_DECRYPT); +} + +static struct skcipher_alg cbc_des_alg = { + .base.cra_name = "cbc(des)", + .base.cra_driver_name = "cbc-des-s390", + .base.cra_priority = 400, /* combo: des + cbc */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = cbc_des_encrypt, + .decrypt = cbc_des_decrypt, +}; + +/* + * RFC2451: + * + * For DES-EDE3, there is no known need to reject weak or + * complementation keys. Any weakness is obviated by the use of + * multiple keys. + * + * However, if the first two or last two independent 64-bit keys are + * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the + * same as DES. Implementers MUST reject keys that exhibit this + * property. + * + * In fips mode additionally check for all 3 keys are unique. + * + */ +static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = crypto_des3_ede_verify_key(tfm, key); + if (err) + return err; + + memcpy(ctx->key, key, key_len); + return 0; +} + +static int des3_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + return des3_setkey(crypto_skcipher_tfm(tfm), key, key_len); +} + +static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + + cpacf_km(CPACF_KM_TDEA_192, ctx->key, dst, src, DES_BLOCK_SIZE); +} + +static void des3_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + + cpacf_km(CPACF_KM_TDEA_192 | CPACF_DECRYPT, + ctx->key, dst, src, DES_BLOCK_SIZE); +} + +static struct crypto_alg des3_alg = { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-s390", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES3_KEY_SIZE, + .cia_max_keysize = DES3_KEY_SIZE, + .cia_setkey = des3_setkey, + .cia_encrypt = des3_encrypt, + .cia_decrypt = des3_decrypt, + } + } +}; + +static int ecb_des3_encrypt(struct skcipher_request *req) +{ + return ecb_desall_crypt(req, CPACF_KM_TDEA_192); +} + +static int ecb_des3_decrypt(struct skcipher_request *req) +{ + return ecb_desall_crypt(req, CPACF_KM_TDEA_192 | CPACF_DECRYPT); +} + +static struct skcipher_alg ecb_des3_alg = { + .base.cra_name = "ecb(des3_ede)", + .base.cra_driver_name = "ecb-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + ecb */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = ecb_des3_encrypt, + .decrypt = ecb_des3_decrypt, +}; + +static int cbc_des3_encrypt(struct skcipher_request *req) +{ + return cbc_desall_crypt(req, CPACF_KMC_TDEA_192); +} + +static int cbc_des3_decrypt(struct skcipher_request *req) +{ + return cbc_desall_crypt(req, CPACF_KMC_TDEA_192 | CPACF_DECRYPT); +} + +static struct skcipher_alg cbc_des3_alg = { + .base.cra_name = "cbc(des3_ede)", + .base.cra_driver_name = "cbc-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + cbc */ + .base.cra_blocksize = DES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = cbc_des3_encrypt, + .decrypt = cbc_des3_decrypt, +}; + +static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +{ + unsigned int i, n; + + /* align to block size, max. PAGE_SIZE */ + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(DES_BLOCK_SIZE - 1); + memcpy(ctrptr, iv, DES_BLOCK_SIZE); + for (i = (n / DES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + DES_BLOCK_SIZE, ctrptr, DES_BLOCK_SIZE); + crypto_inc(ctrptr + DES_BLOCK_SIZE, DES_BLOCK_SIZE); + ctrptr += DES_BLOCK_SIZE; + } + return n; +} + +static int ctr_desall_crypt(struct skcipher_request *req, unsigned long fc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_des_ctx *ctx = crypto_skcipher_ctx(tfm); + u8 buf[DES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; + unsigned int n, nbytes; + int ret, locked; + + locked = mutex_trylock(&ctrblk_lock); + + ret = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) >= DES_BLOCK_SIZE) { + n = DES_BLOCK_SIZE; + if (nbytes >= 2*DES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > DES_BLOCK_SIZE) ? ctrblk : walk.iv; + cpacf_kmctr(fc, ctx->key, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); + if (ctrptr == ctrblk) + memcpy(walk.iv, ctrptr + n - DES_BLOCK_SIZE, + DES_BLOCK_SIZE); + crypto_inc(walk.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - n); + } + if (locked) + mutex_unlock(&ctrblk_lock); + /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ + if (nbytes) { + cpacf_kmctr(fc, ctx->key, buf, walk.src.virt.addr, + DES_BLOCK_SIZE, walk.iv); + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, DES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, 0); + } + return ret; +} + +static int ctr_des_crypt(struct skcipher_request *req) +{ + return ctr_desall_crypt(req, CPACF_KMCTR_DEA); +} + +static struct skcipher_alg ctr_des_alg = { + .base.cra_name = "ctr(des)", + .base.cra_driver_name = "ctr-des-s390", + .base.cra_priority = 400, /* combo: des + ctr */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey_skcipher, + .encrypt = ctr_des_crypt, + .decrypt = ctr_des_crypt, + .chunksize = DES_BLOCK_SIZE, +}; + +static int ctr_des3_crypt(struct skcipher_request *req) +{ + return ctr_desall_crypt(req, CPACF_KMCTR_TDEA_192); +} + +static struct skcipher_alg ctr_des3_alg = { + .base.cra_name = "ctr(des3_ede)", + .base.cra_driver_name = "ctr-des3_ede-s390", + .base.cra_priority = 400, /* combo: des3 + ede */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_des_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey_skcipher, + .encrypt = ctr_des3_crypt, + .decrypt = ctr_des3_crypt, + .chunksize = DES_BLOCK_SIZE, +}; + +static struct crypto_alg *des_s390_algs_ptr[2]; +static int des_s390_algs_num; +static struct skcipher_alg *des_s390_skciphers_ptr[6]; +static int des_s390_skciphers_num; + +static int des_s390_register_alg(struct crypto_alg *alg) +{ + int ret; + + ret = crypto_register_alg(alg); + if (!ret) + des_s390_algs_ptr[des_s390_algs_num++] = alg; + return ret; +} + +static int des_s390_register_skcipher(struct skcipher_alg *alg) +{ + int ret; + + ret = crypto_register_skcipher(alg); + if (!ret) + des_s390_skciphers_ptr[des_s390_skciphers_num++] = alg; + return ret; +} + +static void des_s390_exit(void) +{ + while (des_s390_algs_num--) + crypto_unregister_alg(des_s390_algs_ptr[des_s390_algs_num]); + while (des_s390_skciphers_num--) + crypto_unregister_skcipher(des_s390_skciphers_ptr[des_s390_skciphers_num]); + if (ctrblk) + free_page((unsigned long) ctrblk); +} + +static int __init des_s390_init(void) +{ + int ret; + + /* Query available functions for KM, KMC and KMCTR */ + cpacf_query(CPACF_KM, &km_functions); + cpacf_query(CPACF_KMC, &kmc_functions); + cpacf_query(CPACF_KMCTR, &kmctr_functions); + + if (cpacf_test_func(&km_functions, CPACF_KM_DEA)) { + ret = des_s390_register_alg(&des_alg); + if (ret) + goto out_err; + ret = des_s390_register_skcipher(&ecb_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&kmc_functions, CPACF_KMC_DEA)) { + ret = des_s390_register_skcipher(&cbc_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&km_functions, CPACF_KM_TDEA_192)) { + ret = des_s390_register_alg(&des3_alg); + if (ret) + goto out_err; + ret = des_s390_register_skcipher(&ecb_des3_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&kmc_functions, CPACF_KMC_TDEA_192)) { + ret = des_s390_register_skcipher(&cbc_des3_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto out_err; + } + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA)) { + ret = des_s390_register_skcipher(&ctr_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { + ret = des_s390_register_skcipher(&ctr_des3_alg); + if (ret) + goto out_err; + } + + return 0; +out_err: + des_s390_exit(); + return ret; +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init); +module_exit(des_s390_exit); + +MODULE_ALIAS_CRYPTO("des"); +MODULE_ALIAS_CRYPTO("des3_ede"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c new file mode 100644 index 0000000000..0800a2a579 --- /dev/null +++ b/arch/s390/crypto/ghash_s390.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cryptographic API. + * + * s390 implementation of the GHASH algorithm for GCM (Galois/Counter Mode). + * + * Copyright IBM Corp. 2011 + * Author(s): Gerald Schaefer + */ + +#include +#include +#include +#include + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_ctx { + u8 key[GHASH_BLOCK_SIZE]; +}; + +struct ghash_desc_ctx { + u8 icv[GHASH_BLOCK_SIZE]; + u8 key[GHASH_BLOCK_SIZE]; + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + + memset(dctx, 0, sizeof(*dctx)); + memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) + return -EINVAL; + + memcpy(ctx->key, key, GHASH_BLOCK_SIZE); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int n; + u8 *buf = dctx->buffer; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + n = min(srclen, dctx->bytes); + dctx->bytes -= n; + srclen -= n; + + memcpy(pos, src, n); + src += n; + + if (!dctx->bytes) { + cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, + GHASH_BLOCK_SIZE); + } + } + + n = srclen & ~(GHASH_BLOCK_SIZE - 1); + if (n) { + cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n); + src += n; + srclen -= n; + } + + if (srclen) { + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + memcpy(buf, src, srclen); + } + + return 0; +} + +static int ghash_flush(struct ghash_desc_ctx *dctx) +{ + u8 *buf = dctx->buffer; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + memset(pos, 0, dctx->bytes); + cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); + dctx->bytes = 0; + } + + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + int ret; + + ret = ghash_flush(dctx); + if (!ret) + memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE); + return ret; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-s390", + .cra_priority = 300, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + }, +}; + +static int __init ghash_mod_init(void) +{ + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_GHASH)) + return -ENODEV; + + return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_mod_exit(void) +{ + crypto_unregister_shash(&ghash_alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init); +module_exit(ghash_mod_exit); + +MODULE_ALIAS_CRYPTO("ghash"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH hash function, s390 implementation"); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c new file mode 100644 index 0000000000..55ee5567a5 --- /dev/null +++ b/arch/s390/crypto/paes_s390.c @@ -0,0 +1,809 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cryptographic API. + * + * s390 implementation of the AES Cipher Algorithm with protected keys. + * + * s390 Version: + * Copyright IBM Corp. 2017, 2023 + * Author(s): Martin Schwidefsky + * Harald Freudenberger + */ + +#define KMSG_COMPONENT "paes_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Key blobs smaller/bigger than these defines are rejected + * by the common code even before the individual setkey function + * is called. As paes can handle different kinds of key blobs + * and padding is also possible, the limits need to be generous. + */ +#define PAES_MIN_KEYSIZE 16 +#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE + +static u8 *ctrblk; +static DEFINE_MUTEX(ctrblk_lock); + +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; + +struct key_blob { + /* + * Small keys will be stored in the keybuf. Larger keys are + * stored in extra allocated memory. In both cases does + * key point to the memory where the key is stored. + * The code distinguishes by checking keylen against + * sizeof(keybuf). See the two following helper functions. + */ + u8 *key; + u8 keybuf[128]; + unsigned int keylen; +}; + +static inline int _key_to_kb(struct key_blob *kb, + const u8 *key, + unsigned int keylen) +{ + struct clearkey_header { + u8 type; + u8 res0[3]; + u8 version; + u8 res1[3]; + u32 keytype; + u32 len; + } __packed * h; + + switch (keylen) { + case 16: + case 24: + case 32: + /* clear key value, prepare pkey clear key token in keybuf */ + memset(kb->keybuf, 0, sizeof(kb->keybuf)); + h = (struct clearkey_header *) kb->keybuf; + h->version = 0x02; /* TOKVER_CLEAR_KEY */ + h->keytype = (keylen - 8) >> 3; + h->len = keylen; + memcpy(kb->keybuf + sizeof(*h), key, keylen); + kb->keylen = sizeof(*h) + keylen; + kb->key = kb->keybuf; + break; + default: + /* other key material, let pkey handle this */ + if (keylen <= sizeof(kb->keybuf)) + kb->key = kb->keybuf; + else { + kb->key = kmalloc(keylen, GFP_KERNEL); + if (!kb->key) + return -ENOMEM; + } + memcpy(kb->key, key, keylen); + kb->keylen = keylen; + break; + } + + return 0; +} + +static inline void _free_kb_keybuf(struct key_blob *kb) +{ + if (kb->key && kb->key != kb->keybuf + && kb->keylen > sizeof(kb->keybuf)) { + kfree_sensitive(kb->key); + kb->key = NULL; + } +} + +struct s390_paes_ctx { + struct key_blob kb; + struct pkey_protkey pk; + spinlock_t pk_lock; + unsigned long fc; +}; + +struct s390_pxts_ctx { + struct key_blob kb[2]; + struct pkey_protkey pk[2]; + spinlock_t pk_lock; + unsigned long fc; +}; + +static inline int __paes_keyblob2pkey(struct key_blob *kb, + struct pkey_protkey *pk) +{ + int i, ret; + + /* try three times in case of failure */ + for (i = 0; i < 3; i++) { + if (i > 0 && ret == -EAGAIN && in_task()) + if (msleep_interruptible(1000)) + return -EINTR; + ret = pkey_keyblob2pkey(kb->key, kb->keylen, + pk->protkey, &pk->len, &pk->type); + if (ret == 0) + break; + } + + return ret; +} + +static inline int __paes_convert_key(struct s390_paes_ctx *ctx) +{ + int ret; + struct pkey_protkey pkey; + + pkey.len = sizeof(pkey.protkey); + ret = __paes_keyblob2pkey(&ctx->kb, &pkey); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk, &pkey, sizeof(pkey)); + spin_unlock_bh(&ctx->pk_lock); + + return 0; +} + +static int ecb_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); + + return 0; +} + +static void ecb_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); +} + +static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx) +{ + int rc; + unsigned long fc; + + rc = __paes_convert_key(ctx); + if (rc) + return rc; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 : + (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 : + (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + +static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + int rc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); + rc = _key_to_kb(&ctx->kb, in_key, key_len); + if (rc) + return rc; + + return __ecb_paes_set_key(ctx); +} + +static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n, k; + int ret; + struct { + u8 key[MAXPROTKEYSIZE]; + } param; + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_km(ctx->fc | modifier, ¶m, + walk.dst.virt.addr, walk.src.virt.addr, n); + if (k) + ret = skcipher_walk_done(&walk, nbytes - k); + if (k < n) { + if (__paes_convert_key(ctx)) + return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + } + } + return ret; +} + +static int ecb_paes_encrypt(struct skcipher_request *req) +{ + return ecb_paes_crypt(req, 0); +} + +static int ecb_paes_decrypt(struct skcipher_request *req) +{ + return ecb_paes_crypt(req, CPACF_DECRYPT); +} + +static struct skcipher_alg ecb_paes_alg = { + .base.cra_name = "ecb(paes)", + .base.cra_driver_name = "ecb-paes-s390", + .base.cra_priority = 401, /* combo: aes + ecb + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ecb_paes_alg.base.cra_list), + .init = ecb_paes_init, + .exit = ecb_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .setkey = ecb_paes_set_key, + .encrypt = ecb_paes_encrypt, + .decrypt = ecb_paes_decrypt, +}; + +static int cbc_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); + + return 0; +} + +static void cbc_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); +} + +static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx) +{ + int rc; + unsigned long fc; + + rc = __paes_convert_key(ctx); + if (rc) + return rc; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 : + (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 : + (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + +static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + int rc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); + rc = _key_to_kb(&ctx->kb, in_key, key_len); + if (rc) + return rc; + + return __cbc_paes_set_key(ctx); +} + +static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes, n, k; + int ret; + struct { + u8 iv[AES_BLOCK_SIZE]; + u8 key[MAXPROTKEYSIZE]; + } param; + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + memcpy(param.iv, walk.iv, AES_BLOCK_SIZE); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_kmc(ctx->fc | modifier, ¶m, + walk.dst.virt.addr, walk.src.virt.addr, n); + if (k) { + memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - k); + } + if (k < n) { + if (__paes_convert_key(ctx)) + return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + } + } + return ret; +} + +static int cbc_paes_encrypt(struct skcipher_request *req) +{ + return cbc_paes_crypt(req, 0); +} + +static int cbc_paes_decrypt(struct skcipher_request *req) +{ + return cbc_paes_crypt(req, CPACF_DECRYPT); +} + +static struct skcipher_alg cbc_paes_alg = { + .base.cra_name = "cbc(paes)", + .base.cra_driver_name = "cbc-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(cbc_paes_alg.base.cra_list), + .init = cbc_paes_init, + .exit = cbc_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = cbc_paes_set_key, + .encrypt = cbc_paes_encrypt, + .decrypt = cbc_paes_decrypt, +}; + +static int xts_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + + ctx->kb[0].key = NULL; + ctx->kb[1].key = NULL; + spin_lock_init(&ctx->pk_lock); + + return 0; +} + +static void xts_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb[0]); + _free_kb_keybuf(&ctx->kb[1]); +} + +static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx) +{ + struct pkey_protkey pkey0, pkey1; + + pkey0.len = sizeof(pkey0.protkey); + pkey1.len = sizeof(pkey1.protkey); + + if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) || + __paes_keyblob2pkey(&ctx->kb[1], &pkey1)) + return -EINVAL; + + spin_lock_bh(&ctx->pk_lock); + memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0)); + memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1)); + spin_unlock_bh(&ctx->pk_lock); + + return 0; +} + +static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx) +{ + unsigned long fc; + + if (__xts_paes_convert_key(ctx)) + return -EINVAL; + + if (ctx->pk[0].type != ctx->pk[1].type) + return -EINVAL; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 : + (ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ? + CPACF_KM_PXTS_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + +static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int xts_key_len) +{ + int rc; + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + u8 ckey[2 * AES_MAX_KEY_SIZE]; + unsigned int ckey_len, key_len; + + if (xts_key_len % 2) + return -EINVAL; + + key_len = xts_key_len / 2; + + _free_kb_keybuf(&ctx->kb[0]); + _free_kb_keybuf(&ctx->kb[1]); + rc = _key_to_kb(&ctx->kb[0], in_key, key_len); + if (rc) + return rc; + rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len); + if (rc) + return rc; + + rc = __xts_paes_set_key(ctx); + if (rc) + return rc; + + /* + * xts_verify_key verifies the key length is not odd and makes + * sure that the two keys are not the same. This can be done + * on the two protected keys as well + */ + ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? + AES_KEYSIZE_128 : AES_KEYSIZE_256; + memcpy(ckey, ctx->pk[0].protkey, ckey_len); + memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len); + return xts_verify_key(tfm, ckey, 2*ckey_len); +} + +static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int keylen, offset, nbytes, n, k; + int ret; + struct { + u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */ + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; + } pcc_param; + struct { + u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */ + u8 init[16]; + } xts_param; + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64; + offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0; + + memset(&pcc_param, 0, sizeof(pcc_param)); + memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak)); + spin_lock_bh(&ctx->pk_lock); + memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen); + memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + cpacf_pcc(ctx->fc, pcc_param.key + offset); + memcpy(xts_param.init, pcc_param.xts, 16); + + while ((nbytes = walk.nbytes) != 0) { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + k = cpacf_km(ctx->fc | modifier, xts_param.key + offset, + walk.dst.virt.addr, walk.src.virt.addr, n); + if (k) + ret = skcipher_walk_done(&walk, nbytes - k); + if (k < n) { + if (__xts_paes_convert_key(ctx)) + return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(xts_param.key + offset, + ctx->pk[0].protkey, keylen); + spin_unlock_bh(&ctx->pk_lock); + } + } + + return ret; +} + +static int xts_paes_encrypt(struct skcipher_request *req) +{ + return xts_paes_crypt(req, 0); +} + +static int xts_paes_decrypt(struct skcipher_request *req) +{ + return xts_paes_crypt(req, CPACF_DECRYPT); +} + +static struct skcipher_alg xts_paes_alg = { + .base.cra_name = "xts(paes)", + .base.cra_driver_name = "xts-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = AES_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct s390_pxts_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(xts_paes_alg.base.cra_list), + .init = xts_paes_init, + .exit = xts_paes_exit, + .min_keysize = 2 * PAES_MIN_KEYSIZE, + .max_keysize = 2 * PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_paes_set_key, + .encrypt = xts_paes_encrypt, + .decrypt = xts_paes_decrypt, +}; + +static int ctr_paes_init(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + ctx->kb.key = NULL; + spin_lock_init(&ctx->pk_lock); + + return 0; +} + +static void ctr_paes_exit(struct crypto_skcipher *tfm) +{ + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); +} + +static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx) +{ + int rc; + unsigned long fc; + + rc = __paes_convert_key(ctx); + if (rc) + return rc; + + /* Pick the correct function code based on the protected key type */ + fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 : + (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 : + (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? + CPACF_KMCTR_PAES_256 : 0; + + /* Check if the function code is available */ + ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; + + return ctx->fc ? 0 : -EINVAL; +} + +static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + int rc; + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + + _free_kb_keybuf(&ctx->kb); + rc = _key_to_kb(&ctx->kb, in_key, key_len); + if (rc) + return rc; + + return __ctr_paes_set_key(ctx); +} + +static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) +{ + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + memcpy(ctrptr, iv, AES_BLOCK_SIZE); + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); + crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ctrptr += AES_BLOCK_SIZE; + } + return n; +} + +static int ctr_paes_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm); + u8 buf[AES_BLOCK_SIZE], *ctrptr; + struct skcipher_walk walk; + unsigned int nbytes, n, k; + int ret, locked; + struct { + u8 key[MAXPROTKEYSIZE]; + } param; + + ret = skcipher_walk_virt(&walk, req, false); + if (ret) + return ret; + + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + + locked = mutex_trylock(&ctrblk_lock); + + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + n = AES_BLOCK_SIZE; + if (nbytes >= 2*AES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk.iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv; + k = cpacf_kmctr(ctx->fc, ¶m, walk.dst.virt.addr, + walk.src.virt.addr, n, ctrptr); + if (k) { + if (ctrptr == ctrblk) + memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes - k); + } + if (k < n) { + if (__paes_convert_key(ctx)) { + if (locked) + mutex_unlock(&ctrblk_lock); + return skcipher_walk_done(&walk, -EIO); + } + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + } + } + if (locked) + mutex_unlock(&ctrblk_lock); + /* + * final block may be < AES_BLOCK_SIZE, copy only nbytes + */ + if (nbytes) { + memset(buf, 0, AES_BLOCK_SIZE); + memcpy(buf, walk.src.virt.addr, nbytes); + while (1) { + if (cpacf_kmctr(ctx->fc, ¶m, buf, + buf, AES_BLOCK_SIZE, + walk.iv) == AES_BLOCK_SIZE) + break; + if (__paes_convert_key(ctx)) + return skcipher_walk_done(&walk, -EIO); + spin_lock_bh(&ctx->pk_lock); + memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE); + spin_unlock_bh(&ctx->pk_lock); + } + memcpy(walk.dst.virt.addr, buf, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + ret = skcipher_walk_done(&walk, nbytes); + } + + return ret; +} + +static struct skcipher_alg ctr_paes_alg = { + .base.cra_name = "ctr(paes)", + .base.cra_driver_name = "ctr-paes-s390", + .base.cra_priority = 402, /* ecb-paes-s390 + 1 */ + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct s390_paes_ctx), + .base.cra_module = THIS_MODULE, + .base.cra_list = LIST_HEAD_INIT(ctr_paes_alg.base.cra_list), + .init = ctr_paes_init, + .exit = ctr_paes_exit, + .min_keysize = PAES_MIN_KEYSIZE, + .max_keysize = PAES_MAX_KEYSIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_paes_set_key, + .encrypt = ctr_paes_crypt, + .decrypt = ctr_paes_crypt, + .chunksize = AES_BLOCK_SIZE, +}; + +static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg) +{ + if (!list_empty(&alg->base.cra_list)) + crypto_unregister_skcipher(alg); +} + +static void paes_s390_fini(void) +{ + __crypto_unregister_skcipher(&ctr_paes_alg); + __crypto_unregister_skcipher(&xts_paes_alg); + __crypto_unregister_skcipher(&cbc_paes_alg); + __crypto_unregister_skcipher(&ecb_paes_alg); + if (ctrblk) + free_page((unsigned long) ctrblk); +} + +static int __init paes_s390_init(void) +{ + int ret; + + /* Query available functions for KM, KMC and KMCTR */ + cpacf_query(CPACF_KM, &km_functions); + cpacf_query(CPACF_KMC, &kmc_functions); + cpacf_query(CPACF_KMCTR, &kmctr_functions); + + if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) || + cpacf_test_func(&km_functions, CPACF_KM_PAES_192) || + cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) { + ret = crypto_register_skcipher(&ecb_paes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) || + cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) || + cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) { + ret = crypto_register_skcipher(&cbc_paes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) || + cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) { + ret = crypto_register_skcipher(&xts_paes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) { + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto out_err; + } + ret = crypto_register_skcipher(&ctr_paes_alg); + if (ret) + goto out_err; + } + + return 0; +out_err: + paes_s390_fini(); + return ret; +} + +module_init(paes_s390_init); +module_exit(paes_s390_fini); + +MODULE_ALIAS_CRYPTO("paes"); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys"); +MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c new file mode 100644 index 0000000000..a077087bc6 --- /dev/null +++ b/arch/s390/crypto/prng.c @@ -0,0 +1,911 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2006, 2015 + * Author(s): Jan Glauber + * Harald Freudenberger + * Driver for the s390 pseudo random number generator + */ + +#define KMSG_COMPONENT "prng" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("IBM Corporation"); +MODULE_DESCRIPTION("s390 PRNG interface"); + + +#define PRNG_MODE_AUTO 0 +#define PRNG_MODE_TDES 1 +#define PRNG_MODE_SHA512 2 + +static unsigned int prng_mode = PRNG_MODE_AUTO; +module_param_named(mode, prng_mode, int, 0); +MODULE_PARM_DESC(prng_mode, "PRNG mode: 0 - auto, 1 - TDES, 2 - SHA512"); + + +#define PRNG_CHUNKSIZE_TDES_MIN 8 +#define PRNG_CHUNKSIZE_TDES_MAX (64*1024) +#define PRNG_CHUNKSIZE_SHA512_MIN 64 +#define PRNG_CHUNKSIZE_SHA512_MAX (64*1024) + +static unsigned int prng_chunk_size = 256; +module_param_named(chunksize, prng_chunk_size, int, 0); +MODULE_PARM_DESC(prng_chunk_size, "PRNG read chunk size in bytes"); + + +#define PRNG_RESEED_LIMIT_TDES 4096 +#define PRNG_RESEED_LIMIT_TDES_LOWER 4096 +#define PRNG_RESEED_LIMIT_SHA512 100000 +#define PRNG_RESEED_LIMIT_SHA512_LOWER 10000 + +static unsigned int prng_reseed_limit; +module_param_named(reseed_limit, prng_reseed_limit, int, 0); +MODULE_PARM_DESC(prng_reseed_limit, "PRNG reseed limit"); + +static bool trng_available; + +/* + * Any one who considers arithmetical methods of producing random digits is, + * of course, in a state of sin. -- John von Neumann + */ + +static int prng_errorflag; + +#define PRNG_GEN_ENTROPY_FAILED 1 +#define PRNG_SELFTEST_FAILED 2 +#define PRNG_INSTANTIATE_FAILED 3 +#define PRNG_SEED_FAILED 4 +#define PRNG_RESEED_FAILED 5 +#define PRNG_GEN_FAILED 6 + +struct prng_ws_s { + u8 parm_block[32]; + u32 reseed_counter; + u64 byte_counter; +}; + +struct prno_ws_s { + u32 res; + u32 reseed_counter; + u64 stream_bytes; + u8 V[112]; + u8 C[112]; +}; + +struct prng_data_s { + struct mutex mutex; + union { + struct prng_ws_s prngws; + struct prno_ws_s prnows; + }; + u8 *buf; + u32 rest; + u8 *prev; +}; + +static struct prng_data_s *prng_data; + +/* initial parameter block for tdes mode, copied from libica */ +static const u8 initial_parm_block[32] __initconst = { + 0x0F, 0x2B, 0x8E, 0x63, 0x8C, 0x8E, 0xD2, 0x52, + 0x64, 0xB7, 0xA0, 0x7B, 0x75, 0x28, 0xB8, 0xF4, + 0x75, 0x5F, 0xD2, 0xA6, 0x8D, 0x97, 0x11, 0xFF, + 0x49, 0xD8, 0x23, 0xF3, 0x7E, 0x21, 0xEC, 0xA0 }; + + +/*** helper functions ***/ + +/* + * generate_entropy: + * This function fills a given buffer with random bytes. The entropy within + * the random bytes given back is assumed to have at least 50% - meaning + * a 64 bytes buffer has at least 64 * 8 / 2 = 256 bits of entropy. + * Within the function the entropy generation is done in junks of 64 bytes. + * So the caller should also ask for buffer fill in multiples of 64 bytes. + * The generation of the entropy is based on the assumption that every stckf() + * invocation produces 0.5 bits of entropy. To accumulate 256 bits of entropy + * at least 512 stckf() values are needed. The entropy relevant part of the + * stckf value is bit 51 (counting starts at the left with bit nr 0) so + * here we use the lower 4 bytes and exor the values into 2k of bufferspace. + * To be on the save side, if there is ever a problem with stckf() the + * other half of the page buffer is filled with bytes from urandom via + * get_random_bytes(), so this function consumes 2k of urandom for each + * requested 64 bytes output data. Finally the buffer page is condensed into + * a 64 byte value by hashing with a SHA512 hash. + */ +static int generate_entropy(u8 *ebuf, size_t nbytes) +{ + int n, ret = 0; + u8 *pg, pblock[80] = { + /* 8 x 64 bit init values */ + 0x6A, 0x09, 0xE6, 0x67, 0xF3, 0xBC, 0xC9, 0x08, + 0xBB, 0x67, 0xAE, 0x85, 0x84, 0xCA, 0xA7, 0x3B, + 0x3C, 0x6E, 0xF3, 0x72, 0xFE, 0x94, 0xF8, 0x2B, + 0xA5, 0x4F, 0xF5, 0x3A, 0x5F, 0x1D, 0x36, 0xF1, + 0x51, 0x0E, 0x52, 0x7F, 0xAD, 0xE6, 0x82, 0xD1, + 0x9B, 0x05, 0x68, 0x8C, 0x2B, 0x3E, 0x6C, 0x1F, + 0x1F, 0x83, 0xD9, 0xAB, 0xFB, 0x41, 0xBD, 0x6B, + 0x5B, 0xE0, 0xCD, 0x19, 0x13, 0x7E, 0x21, 0x79, + /* 128 bit counter total message bit length */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }; + + /* allocate one page stckf buffer */ + pg = (u8 *) __get_free_page(GFP_KERNEL); + if (!pg) { + prng_errorflag = PRNG_GEN_ENTROPY_FAILED; + return -ENOMEM; + } + + /* fill the ebuf in chunks of 64 byte each */ + while (nbytes) { + /* fill lower 2k with urandom bytes */ + get_random_bytes(pg, PAGE_SIZE / 2); + /* exor upper 2k with 512 stckf values, offset 4 bytes each */ + for (n = 0; n < 512; n++) { + int offset = (PAGE_SIZE / 2) + (n * 4) - 4; + u64 *p = (u64 *)(pg + offset); + *p ^= get_tod_clock_fast(); + } + /* hash over the filled page */ + cpacf_klmd(CPACF_KLMD_SHA_512, pblock, pg, PAGE_SIZE); + n = (nbytes < 64) ? nbytes : 64; + memcpy(ebuf, pblock, n); + ret += n; + ebuf += n; + nbytes -= n; + } + + memzero_explicit(pblock, sizeof(pblock)); + memzero_explicit(pg, PAGE_SIZE); + free_page((unsigned long)pg); + return ret; +} + + +/*** tdes functions ***/ + +static void prng_tdes_add_entropy(void) +{ + __u64 entropy[4]; + unsigned int i; + + for (i = 0; i < 16; i++) { + cpacf_kmc(CPACF_KMC_PRNG, prng_data->prngws.parm_block, + (char *) entropy, (char *) entropy, + sizeof(entropy)); + memcpy(prng_data->prngws.parm_block, entropy, sizeof(entropy)); + } +} + + +static void prng_tdes_seed(int nbytes) +{ + char buf[16]; + int i = 0; + + BUG_ON(nbytes > sizeof(buf)); + + get_random_bytes(buf, nbytes); + + /* Add the entropy */ + while (nbytes >= 8) { + *((__u64 *)prng_data->prngws.parm_block) ^= *((__u64 *)(buf+i)); + prng_tdes_add_entropy(); + i += 8; + nbytes -= 8; + } + prng_tdes_add_entropy(); + prng_data->prngws.reseed_counter = 0; +} + + +static int __init prng_tdes_instantiate(void) +{ + int datalen; + + pr_debug("prng runs in TDES mode with " + "chunksize=%d and reseed_limit=%u\n", + prng_chunk_size, prng_reseed_limit); + + /* memory allocation, prng_data struct init, mutex init */ + datalen = sizeof(struct prng_data_s) + prng_chunk_size; + prng_data = kzalloc(datalen, GFP_KERNEL); + if (!prng_data) { + prng_errorflag = PRNG_INSTANTIATE_FAILED; + return -ENOMEM; + } + mutex_init(&prng_data->mutex); + prng_data->buf = ((u8 *)prng_data) + sizeof(struct prng_data_s); + memcpy(prng_data->prngws.parm_block, initial_parm_block, 32); + + /* initialize the PRNG, add 128 bits of entropy */ + prng_tdes_seed(16); + + return 0; +} + + +static void prng_tdes_deinstantiate(void) +{ + pr_debug("The prng module stopped " + "after running in triple DES mode\n"); + kfree_sensitive(prng_data); +} + + +/*** sha512 functions ***/ + +static int __init prng_sha512_selftest(void) +{ + /* NIST DRBG testvector for Hash Drbg, Sha-512, Count #0 */ + static const u8 seed[] __initconst = { + 0x6b, 0x50, 0xa7, 0xd8, 0xf8, 0xa5, 0x5d, 0x7a, + 0x3d, 0xf8, 0xbb, 0x40, 0xbc, 0xc3, 0xb7, 0x22, + 0xd8, 0x70, 0x8d, 0xe6, 0x7f, 0xda, 0x01, 0x0b, + 0x03, 0xc4, 0xc8, 0x4d, 0x72, 0x09, 0x6f, 0x8c, + 0x3e, 0xc6, 0x49, 0xcc, 0x62, 0x56, 0xd9, 0xfa, + 0x31, 0xdb, 0x7a, 0x29, 0x04, 0xaa, 0xf0, 0x25 }; + static const u8 V0[] __initconst = { + 0x00, 0xad, 0xe3, 0x6f, 0x9a, 0x01, 0xc7, 0x76, + 0x61, 0x34, 0x35, 0xf5, 0x4e, 0x24, 0x74, 0x22, + 0x21, 0x9a, 0x29, 0x89, 0xc7, 0x93, 0x2e, 0x60, + 0x1e, 0xe8, 0x14, 0x24, 0x8d, 0xd5, 0x03, 0xf1, + 0x65, 0x5d, 0x08, 0x22, 0x72, 0xd5, 0xad, 0x95, + 0xe1, 0x23, 0x1e, 0x8a, 0xa7, 0x13, 0xd9, 0x2b, + 0x5e, 0xbc, 0xbb, 0x80, 0xab, 0x8d, 0xe5, 0x79, + 0xab, 0x5b, 0x47, 0x4e, 0xdd, 0xee, 0x6b, 0x03, + 0x8f, 0x0f, 0x5c, 0x5e, 0xa9, 0x1a, 0x83, 0xdd, + 0xd3, 0x88, 0xb2, 0x75, 0x4b, 0xce, 0x83, 0x36, + 0x57, 0x4b, 0xf1, 0x5c, 0xca, 0x7e, 0x09, 0xc0, + 0xd3, 0x89, 0xc6, 0xe0, 0xda, 0xc4, 0x81, 0x7e, + 0x5b, 0xf9, 0xe1, 0x01, 0xc1, 0x92, 0x05, 0xea, + 0xf5, 0x2f, 0xc6, 0xc6, 0xc7, 0x8f, 0xbc, 0xf4 }; + static const u8 C0[] __initconst = { + 0x00, 0xf4, 0xa3, 0xe5, 0xa0, 0x72, 0x63, 0x95, + 0xc6, 0x4f, 0x48, 0xd0, 0x8b, 0x5b, 0x5f, 0x8e, + 0x6b, 0x96, 0x1f, 0x16, 0xed, 0xbc, 0x66, 0x94, + 0x45, 0x31, 0xd7, 0x47, 0x73, 0x22, 0xa5, 0x86, + 0xce, 0xc0, 0x4c, 0xac, 0x63, 0xb8, 0x39, 0x50, + 0xbf, 0xe6, 0x59, 0x6c, 0x38, 0x58, 0x99, 0x1f, + 0x27, 0xa7, 0x9d, 0x71, 0x2a, 0xb3, 0x7b, 0xf9, + 0xfb, 0x17, 0x86, 0xaa, 0x99, 0x81, 0xaa, 0x43, + 0xe4, 0x37, 0xd3, 0x1e, 0x6e, 0xe5, 0xe6, 0xee, + 0xc2, 0xed, 0x95, 0x4f, 0x53, 0x0e, 0x46, 0x8a, + 0xcc, 0x45, 0xa5, 0xdb, 0x69, 0x0d, 0x81, 0xc9, + 0x32, 0x92, 0xbc, 0x8f, 0x33, 0xe6, 0xf6, 0x09, + 0x7c, 0x8e, 0x05, 0x19, 0x0d, 0xf1, 0xb6, 0xcc, + 0xf3, 0x02, 0x21, 0x90, 0x25, 0xec, 0xed, 0x0e }; + static const u8 random[] __initconst = { + 0x95, 0xb7, 0xf1, 0x7e, 0x98, 0x02, 0xd3, 0x57, + 0x73, 0x92, 0xc6, 0xa9, 0xc0, 0x80, 0x83, 0xb6, + 0x7d, 0xd1, 0x29, 0x22, 0x65, 0xb5, 0xf4, 0x2d, + 0x23, 0x7f, 0x1c, 0x55, 0xbb, 0x9b, 0x10, 0xbf, + 0xcf, 0xd8, 0x2c, 0x77, 0xa3, 0x78, 0xb8, 0x26, + 0x6a, 0x00, 0x99, 0x14, 0x3b, 0x3c, 0x2d, 0x64, + 0x61, 0x1e, 0xee, 0xb6, 0x9a, 0xcd, 0xc0, 0x55, + 0x95, 0x7c, 0x13, 0x9e, 0x8b, 0x19, 0x0c, 0x7a, + 0x06, 0x95, 0x5f, 0x2c, 0x79, 0x7c, 0x27, 0x78, + 0xde, 0x94, 0x03, 0x96, 0xa5, 0x01, 0xf4, 0x0e, + 0x91, 0x39, 0x6a, 0xcf, 0x8d, 0x7e, 0x45, 0xeb, + 0xdb, 0xb5, 0x3b, 0xbf, 0x8c, 0x97, 0x52, 0x30, + 0xd2, 0xf0, 0xff, 0x91, 0x06, 0xc7, 0x61, 0x19, + 0xae, 0x49, 0x8e, 0x7f, 0xbc, 0x03, 0xd9, 0x0f, + 0x8e, 0x4c, 0x51, 0x62, 0x7a, 0xed, 0x5c, 0x8d, + 0x42, 0x63, 0xd5, 0xd2, 0xb9, 0x78, 0x87, 0x3a, + 0x0d, 0xe5, 0x96, 0xee, 0x6d, 0xc7, 0xf7, 0xc2, + 0x9e, 0x37, 0xee, 0xe8, 0xb3, 0x4c, 0x90, 0xdd, + 0x1c, 0xf6, 0xa9, 0xdd, 0xb2, 0x2b, 0x4c, 0xbd, + 0x08, 0x6b, 0x14, 0xb3, 0x5d, 0xe9, 0x3d, 0xa2, + 0xd5, 0xcb, 0x18, 0x06, 0x69, 0x8c, 0xbd, 0x7b, + 0xbb, 0x67, 0xbf, 0xe3, 0xd3, 0x1f, 0xd2, 0xd1, + 0xdb, 0xd2, 0xa1, 0xe0, 0x58, 0xa3, 0xeb, 0x99, + 0xd7, 0xe5, 0x1f, 0x1a, 0x93, 0x8e, 0xed, 0x5e, + 0x1c, 0x1d, 0xe2, 0x3a, 0x6b, 0x43, 0x45, 0xd3, + 0x19, 0x14, 0x09, 0xf9, 0x2f, 0x39, 0xb3, 0x67, + 0x0d, 0x8d, 0xbf, 0xb6, 0x35, 0xd8, 0xe6, 0xa3, + 0x69, 0x32, 0xd8, 0x10, 0x33, 0xd1, 0x44, 0x8d, + 0x63, 0xb4, 0x03, 0xdd, 0xf8, 0x8e, 0x12, 0x1b, + 0x6e, 0x81, 0x9a, 0xc3, 0x81, 0x22, 0x6c, 0x13, + 0x21, 0xe4, 0xb0, 0x86, 0x44, 0xf6, 0x72, 0x7c, + 0x36, 0x8c, 0x5a, 0x9f, 0x7a, 0x4b, 0x3e, 0xe2 }; + + u8 buf[sizeof(random)]; + struct prno_ws_s ws; + + memset(&ws, 0, sizeof(ws)); + + /* initial seed */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &ws, NULL, 0, seed, sizeof(seed)); + + /* check working states V and C */ + if (memcmp(ws.V, V0, sizeof(V0)) != 0 + || memcmp(ws.C, C0, sizeof(C0)) != 0) { + pr_err("The prng self test state test " + "for the SHA-512 mode failed\n"); + prng_errorflag = PRNG_SELFTEST_FAILED; + return -EIO; + } + + /* generate random bytes */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &ws, buf, sizeof(buf), NULL, 0); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &ws, buf, sizeof(buf), NULL, 0); + + /* check against expected data */ + if (memcmp(buf, random, sizeof(random)) != 0) { + pr_err("The prng self test data test " + "for the SHA-512 mode failed\n"); + prng_errorflag = PRNG_SELFTEST_FAILED; + return -EIO; + } + + return 0; +} + + +static int __init prng_sha512_instantiate(void) +{ + int ret, datalen, seedlen; + u8 seed[128 + 16]; + + pr_debug("prng runs in SHA-512 mode " + "with chunksize=%d and reseed_limit=%u\n", + prng_chunk_size, prng_reseed_limit); + + /* memory allocation, prng_data struct init, mutex init */ + datalen = sizeof(struct prng_data_s) + prng_chunk_size; + if (fips_enabled) + datalen += prng_chunk_size; + prng_data = kzalloc(datalen, GFP_KERNEL); + if (!prng_data) { + prng_errorflag = PRNG_INSTANTIATE_FAILED; + return -ENOMEM; + } + mutex_init(&prng_data->mutex); + prng_data->buf = ((u8 *)prng_data) + sizeof(struct prng_data_s); + + /* selftest */ + ret = prng_sha512_selftest(); + if (ret) + goto outfree; + + /* generate initial seed, we need at least 256 + 128 bits entropy. */ + if (trng_available) { + /* + * Trng available, so use it. The trng works in chunks of + * 32 bytes and produces 100% entropy. So we pull 64 bytes + * which gives us 512 bits entropy. + */ + seedlen = 2 * 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* + * No trng available, so use the generate_entropy() function. + * This function works in 64 byte junks and produces + * 50% entropy. So we pull 2*64 bytes which gives us 512 bits + * of entropy. + */ + seedlen = 2 * 64; + ret = generate_entropy(seed, seedlen); + if (ret != seedlen) + goto outfree; + } + + /* append the seed by 16 bytes of unique nonce */ + store_tod_clock_ext((union tod_clock *)(seed + seedlen)); + seedlen += 16; + + /* now initial seed of the prno drng */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); + + /* if fips mode is enabled, generate a first block of random + bytes for the FIPS 140-2 Conditional Self Test */ + if (fips_enabled) { + prng_data->prev = prng_data->buf + prng_chunk_size; + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &prng_data->prnows, + prng_data->prev, prng_chunk_size, NULL, 0); + } + + return 0; + +outfree: + kfree(prng_data); + return ret; +} + + +static void prng_sha512_deinstantiate(void) +{ + pr_debug("The prng module stopped after running in SHA-512 mode\n"); + kfree_sensitive(prng_data); +} + + +static int prng_sha512_reseed(void) +{ + int ret, seedlen; + u8 seed[64]; + + /* We need at least 256 bits of fresh entropy for reseeding */ + if (trng_available) { + /* trng produces 256 bits entropy in 32 bytes */ + seedlen = 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* generate_entropy() produces 256 bits entropy in 64 bytes */ + seedlen = 64; + ret = generate_entropy(seed, seedlen); + if (ret != sizeof(seed)) + return ret; + } + + /* do a reseed of the prno drng with this bytestring */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); + + return 0; +} + + +static int prng_sha512_generate(u8 *buf, size_t nbytes) +{ + int ret; + + /* reseed needed ? */ + if (prng_data->prnows.reseed_counter > prng_reseed_limit) { + ret = prng_sha512_reseed(); + if (ret) + return ret; + } + + /* PRNO generate */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &prng_data->prnows, buf, nbytes, NULL, 0); + + /* FIPS 140-2 Conditional Self Test */ + if (fips_enabled) { + if (!memcmp(prng_data->prev, buf, nbytes)) { + prng_errorflag = PRNG_GEN_FAILED; + return -EILSEQ; + } + memcpy(prng_data->prev, buf, nbytes); + } + + return nbytes; +} + + +/*** file io functions ***/ + +static int prng_open(struct inode *inode, struct file *file) +{ + return nonseekable_open(inode, file); +} + + +static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, + size_t nbytes, loff_t *ppos) +{ + int chunk, n, ret = 0; + + /* lock prng_data struct */ + if (mutex_lock_interruptible(&prng_data->mutex)) + return -ERESTARTSYS; + + while (nbytes) { + if (need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + /* give mutex free before calling schedule() */ + mutex_unlock(&prng_data->mutex); + schedule(); + /* occupy mutex again */ + if (mutex_lock_interruptible(&prng_data->mutex)) { + if (ret == 0) + ret = -ERESTARTSYS; + return ret; + } + } + + /* + * we lose some random bytes if an attacker issues + * reads < 8 bytes, but we don't care + */ + chunk = min_t(int, nbytes, prng_chunk_size); + + /* PRNG only likes multiples of 8 bytes */ + n = (chunk + 7) & -8; + + if (prng_data->prngws.reseed_counter > prng_reseed_limit) + prng_tdes_seed(8); + + /* if the CPU supports PRNG stckf is present too */ + *((unsigned long long *)prng_data->buf) = get_tod_clock_fast(); + + /* + * Beside the STCKF the input for the TDES-EDE is the output + * of the last operation. We differ here from X9.17 since we + * only store one timestamp into the buffer. Padding the whole + * buffer with timestamps does not improve security, since + * successive stckf have nearly constant offsets. + * If an attacker knows the first timestamp it would be + * trivial to guess the additional values. One timestamp + * is therefore enough and still guarantees unique input values. + * + * Note: you can still get strict X9.17 conformity by setting + * prng_chunk_size to 8 bytes. + */ + cpacf_kmc(CPACF_KMC_PRNG, prng_data->prngws.parm_block, + prng_data->buf, prng_data->buf, n); + + prng_data->prngws.byte_counter += n; + prng_data->prngws.reseed_counter += n; + + if (copy_to_user(ubuf, prng_data->buf, chunk)) { + ret = -EFAULT; + break; + } + + nbytes -= chunk; + ret += chunk; + ubuf += chunk; + } + + /* unlock prng_data struct */ + mutex_unlock(&prng_data->mutex); + + return ret; +} + + +static ssize_t prng_sha512_read(struct file *file, char __user *ubuf, + size_t nbytes, loff_t *ppos) +{ + int n, ret = 0; + u8 *p; + + /* if errorflag is set do nothing and return 'broken pipe' */ + if (prng_errorflag) + return -EPIPE; + + /* lock prng_data struct */ + if (mutex_lock_interruptible(&prng_data->mutex)) + return -ERESTARTSYS; + + while (nbytes) { + if (need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + /* give mutex free before calling schedule() */ + mutex_unlock(&prng_data->mutex); + schedule(); + /* occopy mutex again */ + if (mutex_lock_interruptible(&prng_data->mutex)) { + if (ret == 0) + ret = -ERESTARTSYS; + return ret; + } + } + if (prng_data->rest) { + /* push left over random bytes from the previous read */ + p = prng_data->buf + prng_chunk_size - prng_data->rest; + n = (nbytes < prng_data->rest) ? + nbytes : prng_data->rest; + prng_data->rest -= n; + } else { + /* generate one chunk of random bytes into read buf */ + p = prng_data->buf; + n = prng_sha512_generate(p, prng_chunk_size); + if (n < 0) { + ret = n; + break; + } + if (nbytes < prng_chunk_size) { + n = nbytes; + prng_data->rest = prng_chunk_size - n; + } else { + n = prng_chunk_size; + prng_data->rest = 0; + } + } + if (copy_to_user(ubuf, p, n)) { + ret = -EFAULT; + break; + } + memzero_explicit(p, n); + ubuf += n; + nbytes -= n; + ret += n; + } + + /* unlock prng_data struct */ + mutex_unlock(&prng_data->mutex); + + return ret; +} + + +/*** sysfs stuff ***/ + +static const struct file_operations prng_sha512_fops = { + .owner = THIS_MODULE, + .open = &prng_open, + .release = NULL, + .read = &prng_sha512_read, + .llseek = noop_llseek, +}; +static const struct file_operations prng_tdes_fops = { + .owner = THIS_MODULE, + .open = &prng_open, + .release = NULL, + .read = &prng_tdes_read, + .llseek = noop_llseek, +}; + +/* chunksize attribute (ro) */ +static ssize_t prng_chunksize_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size); +} +static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL); + +/* counter attribute (ro) */ +static ssize_t prng_counter_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u64 counter; + + if (mutex_lock_interruptible(&prng_data->mutex)) + return -ERESTARTSYS; + if (prng_mode == PRNG_MODE_SHA512) + counter = prng_data->prnows.stream_bytes; + else + counter = prng_data->prngws.byte_counter; + mutex_unlock(&prng_data->mutex); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", counter); +} +static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL); + +/* errorflag attribute (ro) */ +static ssize_t prng_errorflag_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag); +} +static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL); + +/* mode attribute (ro) */ +static ssize_t prng_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + if (prng_mode == PRNG_MODE_TDES) + return scnprintf(buf, PAGE_SIZE, "TDES\n"); + else + return scnprintf(buf, PAGE_SIZE, "SHA512\n"); +} +static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL); + +/* reseed attribute (w) */ +static ssize_t prng_reseed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + if (mutex_lock_interruptible(&prng_data->mutex)) + return -ERESTARTSYS; + prng_sha512_reseed(); + mutex_unlock(&prng_data->mutex); + + return count; +} +static DEVICE_ATTR(reseed, 0200, NULL, prng_reseed_store); + +/* reseed limit attribute (rw) */ +static ssize_t prng_reseed_limit_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit); +} +static ssize_t prng_reseed_limit_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned limit; + + if (sscanf(buf, "%u\n", &limit) != 1) + return -EINVAL; + + if (prng_mode == PRNG_MODE_SHA512) { + if (limit < PRNG_RESEED_LIMIT_SHA512_LOWER) + return -EINVAL; + } else { + if (limit < PRNG_RESEED_LIMIT_TDES_LOWER) + return -EINVAL; + } + + prng_reseed_limit = limit; + + return count; +} +static DEVICE_ATTR(reseed_limit, 0644, + prng_reseed_limit_show, prng_reseed_limit_store); + +/* strength attribute (ro) */ +static ssize_t prng_strength_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "256\n"); +} +static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL); + +static struct attribute *prng_sha512_dev_attrs[] = { + &dev_attr_errorflag.attr, + &dev_attr_chunksize.attr, + &dev_attr_byte_counter.attr, + &dev_attr_mode.attr, + &dev_attr_reseed.attr, + &dev_attr_reseed_limit.attr, + &dev_attr_strength.attr, + NULL +}; +ATTRIBUTE_GROUPS(prng_sha512_dev); + +static struct attribute *prng_tdes_dev_attrs[] = { + &dev_attr_chunksize.attr, + &dev_attr_byte_counter.attr, + &dev_attr_mode.attr, + NULL +}; +ATTRIBUTE_GROUPS(prng_tdes_dev); + +static struct miscdevice prng_sha512_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_sha512_fops, + .groups = prng_sha512_dev_groups, +}; + +static struct miscdevice prng_tdes_dev = { + .name = "prandom", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0644, + .fops = &prng_tdes_fops, + .groups = prng_tdes_dev_groups, +}; + + +/*** module init and exit ***/ + +static int __init prng_init(void) +{ + int ret; + + /* check if the CPU has a PRNG */ + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) + return -ENODEV; + + /* check if TRNG subfunction is available */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + trng_available = true; + + /* choose prng mode */ + if (prng_mode != PRNG_MODE_TDES) { + /* check for MSA5 support for PRNO operations */ + if (!cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) { + if (prng_mode == PRNG_MODE_SHA512) { + pr_err("The prng module cannot " + "start in SHA-512 mode\n"); + return -ENODEV; + } + prng_mode = PRNG_MODE_TDES; + } else + prng_mode = PRNG_MODE_SHA512; + } + + if (prng_mode == PRNG_MODE_SHA512) { + + /* SHA512 mode */ + + if (prng_chunk_size < PRNG_CHUNKSIZE_SHA512_MIN + || prng_chunk_size > PRNG_CHUNKSIZE_SHA512_MAX) + return -EINVAL; + prng_chunk_size = (prng_chunk_size + 0x3f) & ~0x3f; + + if (prng_reseed_limit == 0) + prng_reseed_limit = PRNG_RESEED_LIMIT_SHA512; + else if (prng_reseed_limit < PRNG_RESEED_LIMIT_SHA512_LOWER) + return -EINVAL; + + ret = prng_sha512_instantiate(); + if (ret) + goto out; + + ret = misc_register(&prng_sha512_dev); + if (ret) { + prng_sha512_deinstantiate(); + goto out; + } + + } else { + + /* TDES mode */ + + if (prng_chunk_size < PRNG_CHUNKSIZE_TDES_MIN + || prng_chunk_size > PRNG_CHUNKSIZE_TDES_MAX) + return -EINVAL; + prng_chunk_size = (prng_chunk_size + 0x07) & ~0x07; + + if (prng_reseed_limit == 0) + prng_reseed_limit = PRNG_RESEED_LIMIT_TDES; + else if (prng_reseed_limit < PRNG_RESEED_LIMIT_TDES_LOWER) + return -EINVAL; + + ret = prng_tdes_instantiate(); + if (ret) + goto out; + + ret = misc_register(&prng_tdes_dev); + if (ret) { + prng_tdes_deinstantiate(); + goto out; + } + } + +out: + return ret; +} + + +static void __exit prng_exit(void) +{ + if (prng_mode == PRNG_MODE_SHA512) { + misc_deregister(&prng_sha512_dev); + prng_sha512_deinstantiate(); + } else { + misc_deregister(&prng_tdes_dev); + prng_tdes_deinstantiate(); + } +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init); +module_exit(prng_exit); diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h new file mode 100644 index 0000000000..65ea12fc87 --- /dev/null +++ b/arch/s390/crypto/sha.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Cryptographic API. + * + * s390 generic implementation of the SHA Secure Hash Algorithms. + * + * Copyright IBM Corp. 2007 + * Author(s): Jan Glauber (jang@de.ibm.com) + */ +#ifndef _CRYPTO_ARCH_S390_SHA_H +#define _CRYPTO_ARCH_S390_SHA_H + +#include +#include +#include +#include + +/* must be big enough for the largest SHA variant */ +#define SHA3_STATE_SIZE 200 +#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE +#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE + +struct s390_sha_ctx { + u64 count; /* message length in bytes */ + u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; + u8 buf[SHA_MAX_BLOCK_SIZE]; + int func; /* KIMD function to use */ +}; + +struct shash_desc; + +int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len); +int s390_sha_final(struct shash_desc *desc, u8 *out); + +#endif diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c new file mode 100644 index 0000000000..bc3a22704e --- /dev/null +++ b/arch/s390/crypto/sha1_s390.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the SHA1 Secure Hash Algorithm. + * + * Derived from cryptoapi implementation, adapted for in-place + * scatterlist interface. Originally based on the public domain + * implementation written by Steve Reid. + * + * s390 Version: + * Copyright IBM Corp. 2003, 2007 + * Author(s): Thomas Spatzier + * Jan Glauber (jan.glauber@de.ibm.com) + * + * Derived from "crypto/sha1_generic.c" + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + */ +#include +#include +#include +#include +#include +#include + +#include "sha.h" + +static int s390_sha1_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA1_H0; + sctx->state[1] = SHA1_H1; + sctx->state[2] = SHA1_H2; + sctx->state[3] = SHA1_H3; + sctx->state[4] = SHA1_H4; + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA_1; + + return 0; +} + +static int s390_sha1_export(struct shash_desc *desc, void *out) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + struct sha1_state *octx = out; + + octx->count = sctx->count; + memcpy(octx->state, sctx->state, sizeof(octx->state)); + memcpy(octx->buffer, sctx->buf, sizeof(octx->buffer)); + return 0; +} + +static int s390_sha1_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha1_state *ictx = in; + + sctx->count = ictx->count; + memcpy(sctx->state, ictx->state, sizeof(ictx->state)); + memcpy(sctx->buf, ictx->buffer, sizeof(ictx->buffer)); + sctx->func = CPACF_KIMD_SHA_1; + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = s390_sha1_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = s390_sha1_export, + .import = s390_sha1_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-s390", + .cra_priority = 300, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha1_s390_init(void) +{ + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) + return -ENODEV; + return crypto_register_shash(&alg); +} + +static void __exit sha1_s390_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init); +module_exit(sha1_s390_fini); + +MODULE_ALIAS_CRYPTO("sha1"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c new file mode 100644 index 0000000000..6f1ccdf93d --- /dev/null +++ b/arch/s390/crypto/sha256_s390.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. + * + * s390 Version: + * Copyright IBM Corp. 2005, 2011 + * Author(s): Jan Glauber (jang@de.ibm.com) + */ +#include +#include +#include +#include +#include +#include + +#include "sha.h" + +static int s390_sha256_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA_256; + + return 0; +} + +static int sha256_export(struct shash_desc *desc, void *out) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + struct sha256_state *octx = out; + + octx->count = sctx->count; + memcpy(octx->state, sctx->state, sizeof(octx->state)); + memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); + return 0; +} + +static int sha256_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha256_state *ictx = in; + + sctx->count = ictx->count; + memcpy(sctx->state, ictx->state, sizeof(ictx->state)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA_256; + return 0; +} + +static struct shash_alg sha256_alg = { + .digestsize = SHA256_DIGEST_SIZE, + .init = s390_sha256_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha256_export, + .import = sha256_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name= "sha256-s390", + .cra_priority = 300, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int s390_sha224_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA_256; + + return 0; +} + +static struct shash_alg sha224_alg = { + .digestsize = SHA224_DIGEST_SIZE, + .init = s390_sha224_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha256_export, + .import = sha256_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name= "sha224-s390", + .cra_priority = 300, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha256_s390_init(void) +{ + int ret; + + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + return -ENODEV; + ret = crypto_register_shash(&sha256_alg); + if (ret < 0) + goto out; + ret = crypto_register_shash(&sha224_alg); + if (ret < 0) + crypto_unregister_shash(&sha256_alg); +out: + return ret; +} + +static void __exit sha256_s390_fini(void) +{ + crypto_unregister_shash(&sha224_alg); + crypto_unregister_shash(&sha256_alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init); +module_exit(sha256_s390_fini); + +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA256 and SHA224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c new file mode 100644 index 0000000000..e1350e033a --- /dev/null +++ b/arch/s390/crypto/sha3_256_s390.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. + * + * s390 Version: + * Copyright IBM Corp. 2019 + * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) + */ +#include +#include +#include +#include +#include +#include + +#include "sha.h" + +static int sha3_256_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA3_256; + + return 0; +} + +static int sha3_256_export(struct shash_desc *desc, void *out) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + struct sha3_state *octx = out; + + octx->rsiz = sctx->count; + memcpy(octx->st, sctx->state, sizeof(octx->st)); + memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); + + return 0; +} + +static int sha3_256_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha3_state *ictx = in; + + sctx->count = ictx->rsiz; + memcpy(sctx->state, ictx->st, sizeof(ictx->st)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA3_256; + + return 0; +} + +static int sha3_224_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha3_state *ictx = in; + + sctx->count = ictx->rsiz; + memcpy(sctx->state, ictx->st, sizeof(ictx->st)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA3_224; + + return 0; +} + +static struct shash_alg sha3_256_alg = { + .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ + .init = sha3_256_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha3_256_export, + .import = sha3_256_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha3_state), + .base = { + .cra_name = "sha3-256", + .cra_driver_name = "sha3-256-s390", + .cra_priority = 300, + .cra_blocksize = SHA3_256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int sha3_224_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA3_224; + + return 0; +} + +static struct shash_alg sha3_224_alg = { + .digestsize = SHA3_224_DIGEST_SIZE, + .init = sha3_224_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha3_256_export, /* same as for 256 */ + .import = sha3_224_import, /* function code different! */ + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha3_state), + .base = { + .cra_name = "sha3-224", + .cra_driver_name = "sha3-224-s390", + .cra_priority = 300, + .cra_blocksize = SHA3_224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha3_256_s390_init(void) +{ + int ret; + + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) + return -ENODEV; + + ret = crypto_register_shash(&sha3_256_alg); + if (ret < 0) + goto out; + + ret = crypto_register_shash(&sha3_224_alg); + if (ret < 0) + crypto_unregister_shash(&sha3_256_alg); +out: + return ret; +} + +static void __exit sha3_256_s390_fini(void) +{ + crypto_unregister_shash(&sha3_224_alg); + crypto_unregister_shash(&sha3_256_alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); +module_exit(sha3_256_s390_fini); + +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c new file mode 100644 index 0000000000..06c142ed9b --- /dev/null +++ b/arch/s390/crypto/sha3_512_s390.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. + * + * Copyright IBM Corp. 2019 + * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) + */ +#include +#include +#include +#include +#include +#include + +#include "sha.h" + +static int sha3_512_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA3_512; + + return 0; +} + +static int sha3_512_export(struct shash_desc *desc, void *out) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + struct sha3_state *octx = out; + + octx->rsiz = sctx->count; + octx->rsizw = sctx->count >> 32; + + memcpy(octx->st, sctx->state, sizeof(octx->st)); + memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); + + return 0; +} + +static int sha3_512_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha3_state *ictx = in; + + if (unlikely(ictx->rsizw)) + return -ERANGE; + sctx->count = ictx->rsiz; + + memcpy(sctx->state, ictx->st, sizeof(ictx->st)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA3_512; + + return 0; +} + +static int sha3_384_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha3_state *ictx = in; + + if (unlikely(ictx->rsizw)) + return -ERANGE; + sctx->count = ictx->rsiz; + + memcpy(sctx->state, ictx->st, sizeof(ictx->st)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA3_384; + + return 0; +} + +static struct shash_alg sha3_512_alg = { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = sha3_512_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha3_512_export, + .import = sha3_512_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha3_state), + .base = { + .cra_name = "sha3-512", + .cra_driver_name = "sha3-512-s390", + .cra_priority = 300, + .cra_blocksize = SHA3_512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +MODULE_ALIAS_CRYPTO("sha3-512"); + +static int sha3_384_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + + memset(sctx->state, 0, sizeof(sctx->state)); + sctx->count = 0; + sctx->func = CPACF_KIMD_SHA3_384; + + return 0; +} + +static struct shash_alg sha3_384_alg = { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = sha3_384_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha3_512_export, /* same as for 512 */ + .import = sha3_384_import, /* function code different! */ + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha3_state), + .base = { + .cra_name = "sha3-384", + .cra_driver_name = "sha3-384-s390", + .cra_priority = 300, + .cra_blocksize = SHA3_384_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_sha_ctx), + .cra_module = THIS_MODULE, + } +}; + +MODULE_ALIAS_CRYPTO("sha3-384"); + +static int __init init(void) +{ + int ret; + + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) + return -ENODEV; + ret = crypto_register_shash(&sha3_512_alg); + if (ret < 0) + goto out; + ret = crypto_register_shash(&sha3_384_alg); + if (ret < 0) + crypto_unregister_shash(&sha3_512_alg); +out: + return ret; +} + +static void __exit fini(void) +{ + crypto_unregister_shash(&sha3_512_alg); + crypto_unregister_shash(&sha3_384_alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c new file mode 100644 index 0000000000..04f11c4077 --- /dev/null +++ b/arch/s390/crypto/sha512_s390.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 implementation of the SHA512 and SHA38 Secure Hash Algorithm. + * + * Copyright IBM Corp. 2007 + * Author(s): Jan Glauber (jang@de.ibm.com) + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sha.h" + +static int sha512_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *ctx = shash_desc_ctx(desc); + + *(__u64 *)&ctx->state[0] = SHA512_H0; + *(__u64 *)&ctx->state[2] = SHA512_H1; + *(__u64 *)&ctx->state[4] = SHA512_H2; + *(__u64 *)&ctx->state[6] = SHA512_H3; + *(__u64 *)&ctx->state[8] = SHA512_H4; + *(__u64 *)&ctx->state[10] = SHA512_H5; + *(__u64 *)&ctx->state[12] = SHA512_H6; + *(__u64 *)&ctx->state[14] = SHA512_H7; + ctx->count = 0; + ctx->func = CPACF_KIMD_SHA_512; + + return 0; +} + +static int sha512_export(struct shash_desc *desc, void *out) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + struct sha512_state *octx = out; + + octx->count[0] = sctx->count; + octx->count[1] = 0; + memcpy(octx->state, sctx->state, sizeof(octx->state)); + memcpy(octx->buf, sctx->buf, sizeof(octx->buf)); + return 0; +} + +static int sha512_import(struct shash_desc *desc, const void *in) +{ + struct s390_sha_ctx *sctx = shash_desc_ctx(desc); + const struct sha512_state *ictx = in; + + if (unlikely(ictx->count[1])) + return -ERANGE; + sctx->count = ictx->count[0]; + + memcpy(sctx->state, ictx->state, sizeof(ictx->state)); + memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); + sctx->func = CPACF_KIMD_SHA_512; + return 0; +} + +static struct shash_alg sha512_alg = { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha512_export, + .import = sha512_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name= "sha512-s390", + .cra_priority = 300, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +MODULE_ALIAS_CRYPTO("sha512"); + +static int sha384_init(struct shash_desc *desc) +{ + struct s390_sha_ctx *ctx = shash_desc_ctx(desc); + + *(__u64 *)&ctx->state[0] = SHA384_H0; + *(__u64 *)&ctx->state[2] = SHA384_H1; + *(__u64 *)&ctx->state[4] = SHA384_H2; + *(__u64 *)&ctx->state[6] = SHA384_H3; + *(__u64 *)&ctx->state[8] = SHA384_H4; + *(__u64 *)&ctx->state[10] = SHA384_H5; + *(__u64 *)&ctx->state[12] = SHA384_H6; + *(__u64 *)&ctx->state[14] = SHA384_H7; + ctx->count = 0; + ctx->func = CPACF_KIMD_SHA_512; + + return 0; +} + +static struct shash_alg sha384_alg = { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_init, + .update = s390_sha_update, + .final = s390_sha_final, + .export = sha512_export, + .import = sha512_import, + .descsize = sizeof(struct s390_sha_ctx), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name= "sha384-s390", + .cra_priority = 300, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_sha_ctx), + .cra_module = THIS_MODULE, + } +}; + +MODULE_ALIAS_CRYPTO("sha384"); + +static int __init init(void) +{ + int ret; + + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) + return -ENODEV; + if ((ret = crypto_register_shash(&sha512_alg)) < 0) + goto out; + if ((ret = crypto_register_shash(&sha384_alg)) < 0) + crypto_unregister_shash(&sha512_alg); +out: + return ret; +} + +static void __exit fini(void) +{ + crypto_unregister_shash(&sha512_alg); + crypto_unregister_shash(&sha384_alg); +} + +module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA512 and SHA-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c new file mode 100644 index 0000000000..686fe7aa19 --- /dev/null +++ b/arch/s390/crypto/sha_common.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Cryptographic API. + * + * s390 generic implementation of the SHA Secure Hash Algorithms. + * + * Copyright IBM Corp. 2007 + * Author(s): Jan Glauber (jang@de.ibm.com) + */ + +#include +#include +#include +#include "sha.h" + +int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) +{ + struct s390_sha_ctx *ctx = shash_desc_ctx(desc); + unsigned int bsize = crypto_shash_blocksize(desc->tfm); + unsigned int index, n; + + /* how much is already in the buffer? */ + index = ctx->count % bsize; + ctx->count += len; + + if ((index + len) < bsize) + goto store; + + /* process one stored block */ + if (index) { + memcpy(ctx->buf + index, data, bsize - index); + cpacf_kimd(ctx->func, ctx->state, ctx->buf, bsize); + data += bsize - index; + len -= bsize - index; + index = 0; + } + + /* process as many blocks as possible */ + if (len >= bsize) { + n = (len / bsize) * bsize; + cpacf_kimd(ctx->func, ctx->state, data, n); + data += n; + len -= n; + } +store: + if (len) + memcpy(ctx->buf + index , data, len); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_sha_update); + +static int s390_crypto_shash_parmsize(int func) +{ + switch (func) { + case CPACF_KLMD_SHA_1: + return 20; + case CPACF_KLMD_SHA_256: + return 32; + case CPACF_KLMD_SHA_512: + return 64; + case CPACF_KLMD_SHA3_224: + case CPACF_KLMD_SHA3_256: + case CPACF_KLMD_SHA3_384: + case CPACF_KLMD_SHA3_512: + return 200; + default: + return -EINVAL; + } +} + +int s390_sha_final(struct shash_desc *desc, u8 *out) +{ + struct s390_sha_ctx *ctx = shash_desc_ctx(desc); + unsigned int bsize = crypto_shash_blocksize(desc->tfm); + u64 bits; + unsigned int n; + int mbl_offset; + + n = ctx->count % bsize; + bits = ctx->count * 8; + mbl_offset = s390_crypto_shash_parmsize(ctx->func); + if (mbl_offset < 0) + return -EINVAL; + + mbl_offset = mbl_offset / sizeof(u32); + + /* set total msg bit length (mbl) in CPACF parmblock */ + switch (ctx->func) { + case CPACF_KLMD_SHA_1: + case CPACF_KLMD_SHA_256: + memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); + break; + case CPACF_KLMD_SHA_512: + /* + * the SHA512 parmblock has a 128-bit mbl field, clear + * high-order u64 field, copy bits to low-order u64 field + */ + memset(ctx->state + mbl_offset, 0x00, sizeof(bits)); + mbl_offset += sizeof(u64) / sizeof(u32); + memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); + break; + case CPACF_KLMD_SHA3_224: + case CPACF_KLMD_SHA3_256: + case CPACF_KLMD_SHA3_384: + case CPACF_KLMD_SHA3_512: + break; + default: + return -EINVAL; + } + + cpacf_klmd(ctx->func, ctx->state, ctx->buf, n); + + /* copy digest to out */ + memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); + /* wipe context */ + memset(ctx, 0, sizeof *ctx); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_sha_final); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("s390 SHA cipher common functions"); diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile new file mode 100644 index 0000000000..c34854d298 --- /dev/null +++ b/arch/s390/hypfs/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux hypfs filesystem routines. +# + +obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o +obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o + +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += inode.o diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h new file mode 100644 index 0000000000..65f4036fd5 --- /dev/null +++ b/arch/s390/hypfs/hypfs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu + */ + +#ifndef _HYPFS_H_ +#define _HYPFS_H_ + +#include +#include +#include +#include +#include +#include + +#define REG_FILE_MODE 0440 +#define UPDATE_FILE_MODE 0220 +#define DIR_MODE 0550 + +extern struct dentry *hypfs_mkdir(struct dentry *parent, const char *name); + +extern struct dentry *hypfs_create_u64(struct dentry *dir, const char *name, + __u64 value); + +extern struct dentry *hypfs_create_str(struct dentry *dir, const char *name, + char *string); + +/* LPAR Hypervisor */ +extern int hypfs_diag_init(void); +extern void hypfs_diag_exit(void); +extern int hypfs_diag_create_files(struct dentry *root); + +/* VM Hypervisor */ +extern int hypfs_vm_init(void); +extern void hypfs_vm_exit(void); +extern int hypfs_vm_create_files(struct dentry *root); + +/* VM diagnose 0c */ +int hypfs_diag0c_init(void); +void hypfs_diag0c_exit(void); + +/* Set Partition-Resource Parameter */ +void hypfs_sprp_init(void); +void hypfs_sprp_exit(void); + +int __hypfs_fs_init(void); + +static inline int hypfs_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_fs_init(); + return 0; +} + +/* debugfs interface */ +struct hypfs_dbfs_file; + +struct hypfs_dbfs_data { + void *buf; + void *buf_free_ptr; + size_t size; + struct hypfs_dbfs_file *dbfs_file; +}; + +struct hypfs_dbfs_file { + const char *name; + int (*data_create)(void **data, void **data_free_ptr, + size_t *size); + void (*data_free)(const void *buf_free_ptr); + long (*unlocked_ioctl) (struct file *, unsigned int, + unsigned long); + + /* Private data for hypfs_dbfs.c */ + struct mutex lock; + struct dentry *dentry; +}; + +extern void hypfs_dbfs_exit(void); +extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); +extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); + +#endif /* _HYPFS_H_ */ diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c new file mode 100644 index 0000000000..4024599eb4 --- /dev/null +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390 - debugfs interface + * + * Copyright IBM Corp. 2010 + * Author(s): Michael Holzheu + */ + +#include +#include "hypfs.h" + +static struct dentry *dbfs_dir; + +static struct hypfs_dbfs_data *hypfs_dbfs_data_alloc(struct hypfs_dbfs_file *f) +{ + struct hypfs_dbfs_data *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->dbfs_file = f; + return data; +} + +static void hypfs_dbfs_data_free(struct hypfs_dbfs_data *data) +{ + data->dbfs_file->data_free(data->buf_free_ptr); + kfree(data); +} + +static ssize_t dbfs_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct hypfs_dbfs_data *data; + struct hypfs_dbfs_file *df; + ssize_t rc; + + if (*ppos != 0) + return 0; + + df = file_inode(file)->i_private; + mutex_lock(&df->lock); + data = hypfs_dbfs_data_alloc(df); + if (!data) { + mutex_unlock(&df->lock); + return -ENOMEM; + } + rc = df->data_create(&data->buf, &data->buf_free_ptr, &data->size); + if (rc) { + mutex_unlock(&df->lock); + kfree(data); + return rc; + } + mutex_unlock(&df->lock); + + rc = simple_read_from_buffer(buf, size, ppos, data->buf, data->size); + hypfs_dbfs_data_free(data); + return rc; +} + +static long dbfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct hypfs_dbfs_file *df = file_inode(file)->i_private; + long rc; + + mutex_lock(&df->lock); + if (df->unlocked_ioctl) + rc = df->unlocked_ioctl(file, cmd, arg); + else + rc = -ENOTTY; + mutex_unlock(&df->lock); + return rc; +} + +static const struct file_operations dbfs_ops = { + .read = dbfs_read, + .llseek = no_llseek, + .unlocked_ioctl = dbfs_ioctl, +}; + +void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df) +{ + df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, + &dbfs_ops); + mutex_init(&df->lock); +} + +void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) +{ + debugfs_remove(df->dentry); +} + +static int __init hypfs_dbfs_init(void) +{ + int rc = -ENODATA; + + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (hypfs_diag_init()) + goto fail_dbfs_exit; + if (hypfs_vm_init()) + goto fail_hypfs_diag_exit; + hypfs_sprp_init(); + if (hypfs_diag0c_init()) + goto fail_hypfs_sprp_exit; + rc = hypfs_fs_init(); + if (rc) + goto fail_hypfs_diag0c_exit; + return 0; + +fail_hypfs_diag0c_exit: + hypfs_diag0c_exit(); +fail_hypfs_sprp_exit: + hypfs_sprp_exit(); + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); + pr_err("Initialization of hypfs failed with rc=%i\n", rc); +fail_dbfs_exit: + debugfs_remove(dbfs_dir); + return rc; +} +device_initcall(hypfs_dbfs_init) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c new file mode 100644 index 0000000000..279b7bba4d --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_diag.h" +#include "hypfs.h" + +#define DBFS_D204_HDR_VERSION 0 + +static enum diag204_sc diag204_store_sc; /* used subcode for store */ +static enum diag204_format diag204_info_type; /* used diag 204 data format */ + +static void *diag204_buf; /* 4K aligned buffer for diag204 data */ +static int diag204_buf_pages; /* number of pages for diag204 data */ + +static struct dentry *dbfs_d204_file; + +enum diag204_format diag204_get_info_type(void) +{ + return diag204_info_type; +} + +static void diag204_set_info_type(enum diag204_format type) +{ + diag204_info_type = type; +} + +/* Diagnose 204 functions */ +/* + * For the old diag subcode 4 with simple data format we have to use real + * memory. If we use subcode 6 or 7 with extended data format, we can (and + * should) use vmalloc, since we need a lot of memory in that case. Currently + * up to 93 pages! + */ + +static void diag204_free_buffer(void) +{ + vfree(diag204_buf); + diag204_buf = NULL; +} + +void *diag204_get_buffer(enum diag204_format fmt, int *pages) +{ + if (diag204_buf) { + *pages = diag204_buf_pages; + return diag204_buf; + } + if (fmt == DIAG204_INFO_SIMPLE) { + *pages = 1; + } else {/* DIAG204_INFO_EXT */ + *pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (*pages <= 0) + return ERR_PTR(-EOPNOTSUPP); + } + diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + diag204_buf_pages = *pages; + return diag204_buf; +} + +/* + * diag204_probe() has to find out, which type of diagnose 204 implementation + * we have on our machine. Currently there are three possible scanarios: + * - subcode 4 + simple data format (only one page) + * - subcode 4-6 + extended data format + * - subcode 4-7 + extended data format + * + * Subcode 5 is used to retrieve the size of the data, provided by subcodes + * 6 and 7. Subcode 7 basically has the same function as subcode 6. In addition + * to subcode 6 it provides also information about secondary cpus. + * In order to get as much information as possible, we first try + * subcode 7, then 6 and if both fail, we use subcode 4. + */ + +static int diag204_probe(void) +{ + void *buf; + int pages, rc; + + buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages); + if (!IS_ERR(buf)) { + if (diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB7; + diag204_set_info_type(DIAG204_INFO_EXT); + goto out; + } + if (diag204((unsigned long)DIAG204_SUBC_STIB6 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB6; + diag204_set_info_type(DIAG204_INFO_EXT); + goto out; + } + diag204_free_buffer(); + } + + /* subcodes 6 and 7 failed, now try subcode 4 */ + + buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages); + if (IS_ERR(buf)) { + rc = PTR_ERR(buf); + goto fail_alloc; + } + if (diag204((unsigned long)DIAG204_SUBC_STIB4 | + (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB4; + diag204_set_info_type(DIAG204_INFO_SIMPLE); + goto out; + } else { + rc = -EOPNOTSUPP; + goto fail_store; + } +out: + rc = 0; +fail_store: + diag204_free_buffer(); +fail_alloc: + return rc; +} + +int diag204_store(void *buf, int pages) +{ + int rc; + + rc = diag204((unsigned long)diag204_store_sc | + (unsigned long)diag204_get_info_type(), pages, buf); + return rc < 0 ? -EOPNOTSUPP : 0; +} + +struct dbfs_d204_hdr { + u64 len; /* Length of d204 buffer without header */ + u16 version; /* Version of header */ + u8 sc; /* Used subcode */ + char reserved[53]; +} __attribute__ ((packed)); + +struct dbfs_d204 { + struct dbfs_d204_hdr hdr; /* 64 byte header */ + char buf[]; /* d204 buffer */ +} __attribute__ ((packed)); + +static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d204 *d204; + int rc, buf_size; + void *base; + + buf_size = PAGE_SIZE * (diag204_buf_pages + 1) + sizeof(d204->hdr); + base = vzalloc(buf_size); + if (!base) + return -ENOMEM; + d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); + rc = diag204_store(d204->buf, diag204_buf_pages); + if (rc) { + vfree(base); + return rc; + } + d204->hdr.version = DBFS_D204_HDR_VERSION; + d204->hdr.len = PAGE_SIZE * diag204_buf_pages; + d204->hdr.sc = diag204_store_sc; + *data = d204; + *data_free_ptr = base; + *size = d204->hdr.len + sizeof(struct dbfs_d204_hdr); + return 0; +} + +static struct hypfs_dbfs_file dbfs_file_d204 = { + .name = "diag_204", + .data_create = dbfs_d204_create, + .data_free = vfree, +}; + +__init int hypfs_diag_init(void) +{ + int rc; + + if (diag204_probe()) { + pr_info("The hardware system does not support hypfs\n"); + return -ENODATA; + } + + if (diag204_get_info_type() == DIAG204_INFO_EXT) + hypfs_dbfs_create_file(&dbfs_file_d204); + + rc = hypfs_diag_fs_init(); + if (rc) { + pr_err("The hardware system does not provide all functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); + } + return rc; +} + +void hypfs_diag_exit(void) +{ + debugfs_remove(dbfs_d204_file); + hypfs_diag_fs_exit(); + diag204_free_buffer(); + hypfs_dbfs_remove_file(&dbfs_file_d204); +} diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h new file mode 100644 index 0000000000..7090eff27f --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu + */ + +#ifndef _S390_HYPFS_DIAG_H_ +#define _S390_HYPFS_DIAG_H_ + +#include + +enum diag204_format diag204_get_info_type(void); +void *diag204_get_buffer(enum diag204_format fmt, int *pages); +int diag204_store(void *buf, int pages); + +int __hypfs_diag_fs_init(void); +void __hypfs_diag_fs_exit(void); + +static inline int hypfs_diag_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_diag_fs_init(); + return 0; +} + +static inline void hypfs_diag_fs_exit(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + __hypfs_diag_fs_exit(); +} + +#endif /* _S390_HYPFS_DIAG_H_ */ diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c new file mode 100644 index 0000000000..9a2786079e --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390 + * + * Diag 0C implementation + * + * Copyright IBM Corp. 2014 + */ + +#include +#include +#include +#include +#include "hypfs.h" + +#define DBFS_D0C_HDR_VERSION 0 + +/* + * Get hypfs_diag0c_entry from CPU vector and store diag0c data + */ +static void diag0c_fn(void *data) +{ + diag_stat_inc(DIAG_STAT_X00C); + diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]); +} + +/* + * Allocate buffer and store diag 0c data + */ +static void *diag0c_store(unsigned int *count) +{ + struct hypfs_diag0c_data *diag0c_data; + unsigned int cpu_count, cpu, i; + void **cpu_vec; + + cpus_read_lock(); + cpu_count = num_online_cpus(); + cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec), + GFP_KERNEL); + if (!cpu_vec) + goto fail_unlock_cpus; + /* Note: Diag 0c needs 8 byte alignment and real storage */ + diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count), + GFP_KERNEL | GFP_DMA); + if (!diag0c_data) + goto fail_kfree_cpu_vec; + i = 0; + /* Fill CPU vector for each online CPU */ + for_each_online_cpu(cpu) { + diag0c_data->entry[i].cpu = cpu; + cpu_vec[cpu] = &diag0c_data->entry[i++]; + } + /* Collect data all CPUs */ + on_each_cpu(diag0c_fn, cpu_vec, 1); + *count = cpu_count; + kfree(cpu_vec); + cpus_read_unlock(); + return diag0c_data; + +fail_kfree_cpu_vec: + kfree(cpu_vec); +fail_unlock_cpus: + cpus_read_unlock(); + return ERR_PTR(-ENOMEM); +} + +/* + * Hypfs DBFS callback: Free diag 0c data + */ +static void dbfs_diag0c_free(const void *data) +{ + kfree(data); +} + +/* + * Hypfs DBFS callback: Create diag 0c data + */ +static int dbfs_diag0c_create(void **data, void **data_free_ptr, size_t *size) +{ + struct hypfs_diag0c_data *diag0c_data; + unsigned int count; + + diag0c_data = diag0c_store(&count); + if (IS_ERR(diag0c_data)) + return PTR_ERR(diag0c_data); + memset(&diag0c_data->hdr, 0, sizeof(diag0c_data->hdr)); + store_tod_clock_ext((union tod_clock *)diag0c_data->hdr.tod_ext); + diag0c_data->hdr.len = count * sizeof(struct hypfs_diag0c_entry); + diag0c_data->hdr.version = DBFS_D0C_HDR_VERSION; + diag0c_data->hdr.count = count; + *data = diag0c_data; + *data_free_ptr = diag0c_data; + *size = diag0c_data->hdr.len + sizeof(struct hypfs_diag0c_hdr); + return 0; +} + +/* + * Hypfs DBFS file structure + */ +static struct hypfs_dbfs_file dbfs_file_0c = { + .name = "diag_0c", + .data_create = dbfs_diag0c_create, + .data_free = dbfs_diag0c_free, +}; + +/* + * Initialize diag 0c interface for z/VM + */ +int __init hypfs_diag0c_init(void) +{ + if (!MACHINE_IS_VM) + return 0; + hypfs_dbfs_create_file(&dbfs_file_0c); + return 0; +} + +/* + * Shutdown diag 0c interface for z/VM + */ +void hypfs_diag0c_exit(void) +{ + if (!MACHINE_IS_VM) + return; + hypfs_dbfs_remove_file(&dbfs_file_0c); +} diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c new file mode 100644 index 0000000000..00a6d370a2 --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_diag.h" +#include "hypfs.h" + +#define TMP_SIZE 64 /* size of temporary buffers */ + +static char *diag224_cpu_names; /* diag 224 name table */ +static int diag224_idx2name(int index, char *name); + +/* + * DIAG 204 member access functions. + * + * Since we have two different diag 204 data formats for old and new s390 + * machines, we do not access the structs directly, but use getter functions for + * each struct member instead. This should make the code more readable. + */ + +/* Time information block */ + +static inline int info_blk_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); +} + +static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; +} + +static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; +} + +/* Partition header */ + +static inline int part_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); +} + +static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; +} + +static inline void part_hdr__part_name(enum diag204_format type, void *hdr, + char *name) +{ + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; + strim(name); +} + +/* CPU info block */ + +static inline int cpu_info__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); +} + +static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; +} + +static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; +} + +static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; +} + +static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; +} + +static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return 0; /* online_time not available in simple info */ + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; +} + +/* Physical header */ + +static inline int phys_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); +} + +static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; +} + +/* Physical CPU info block */ + +static inline int phys_cpu__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); +} + +static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; +} + +static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; +} + +static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; +} + +/* + * Functions to create the directory structure + * ******************************************* + */ + +static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + cpu_info__acc_time(diag204_get_info_type(), cpu_info) - + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + rc = hypfs_create_u64(cpu_dir, "cputime", + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + if (diag204_get_info_type() == DIAG204_INFO_EXT) { + rc = hypfs_create_u64(cpu_dir, "onlinetime", + cpu_info__online_time(diag204_get_info_type(), + cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + } + diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) +{ + struct dentry *cpus_dir; + struct dentry *lpar_dir; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; + void *cpu_info; + int i; + + part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name); + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; + lpar_dir = hypfs_mkdir(systems_dir, lpar_name); + if (IS_ERR(lpar_dir)) + return lpar_dir; + cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = part_hdr + part_hdr__size(diag204_get_info_type()); + for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) { + int rc; + + rc = hypfs_create_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += cpu_info__size(diag204_get_info_type()); + } + return cpu_info; +} + +static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) +{ + int i; + void *cpu_info; + struct dentry *cpus_dir; + + cpus_dir = hypfs_mkdir(parent_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type()); + for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) { + int rc; + + rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += phys_cpu__size(diag204_get_info_type()); + } + return cpu_info; +} + +int hypfs_diag_create_files(struct dentry *root) +{ + struct dentry *systems_dir, *hyp_dir; + void *time_hdr, *part_hdr; + void *buffer, *ptr; + int i, rc, pages; + + buffer = diag204_get_buffer(diag204_get_info_type(), &pages); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + rc = diag204_store(buffer, pages); + if (rc) + return rc; + + systems_dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(systems_dir)) { + rc = PTR_ERR(systems_dir); + goto err_out; + } + time_hdr = (struct x_info_blk_hdr *)buffer; + part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); + for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { + part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); + if (IS_ERR(part_hdr)) { + rc = PTR_ERR(part_hdr); + goto err_out; + } + } + if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & + DIAG204_LPAR_PHYS_FLG) { + ptr = hypfs_create_phys_files(root, part_hdr); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + } + hyp_dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(hyp_dir)) { + rc = PTR_ERR(hyp_dir); + goto err_out; + } + ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + rc = 0; + +err_out: + return rc; +} + +/* Diagnose 224 functions */ + +static int diag224_idx2name(int index, char *name) +{ + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; + strim(name); + return 0; +} + +static int diag224_get_name_table(void) +{ + /* memory must be below 2GB */ + diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_cpu_names) + return -ENOMEM; + if (diag224(diag224_cpu_names)) { + free_page((unsigned long)diag224_cpu_names); + return -EOPNOTSUPP; + } + EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); + return 0; +} + +static void diag224_delete_name_table(void) +{ + free_page((unsigned long)diag224_cpu_names); +} + +int __init __hypfs_diag_fs_init(void) +{ + if (MACHINE_IS_LPAR) + return diag224_get_name_table(); + return 0; +} + +void __hypfs_diag_fs_exit(void) +{ + diag224_delete_name_table(); +} diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c new file mode 100644 index 0000000000..f5f7e78ddc --- /dev/null +++ b/arch/s390/hypfs/hypfs_sprp.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. + * Set Partition-Resource Parameter interface. + * + * Copyright IBM Corp. 2013 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs.h" + +#define DIAG304_SET_WEIGHTS 0 +#define DIAG304_QUERY_PRP 1 +#define DIAG304_SET_CAPPING 2 + +#define DIAG304_CMD_MAX 2 + +static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd) +{ + union register_pair r1 = { .even = (unsigned long)data, }; + + asm volatile("diag %[r1],%[r3],0x304\n" + : [r1] "+&d" (r1.pair) + : [r3] "d" (cmd) + : "memory"); + return r1.odd; +} + +static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd) +{ + diag_stat_inc(DIAG_STAT_X304); + return __hypfs_sprp_diag304(data, cmd); +} + +static void hypfs_sprp_free(const void *data) +{ + free_page((unsigned long) data); +} + +static int hypfs_sprp_create(void **data_ptr, void **free_ptr, size_t *size) +{ + unsigned long rc; + void *data; + + data = (void *) get_zeroed_page(GFP_KERNEL); + if (!data) + return -ENOMEM; + rc = hypfs_sprp_diag304(data, DIAG304_QUERY_PRP); + if (rc != 1) { + *data_ptr = *free_ptr = NULL; + *size = 0; + free_page((unsigned long) data); + return -EIO; + } + *data_ptr = *free_ptr = data; + *size = PAGE_SIZE; + return 0; +} + +static int __hypfs_sprp_ioctl(void __user *user_area) +{ + struct hypfs_diag304 *diag304; + unsigned long cmd; + void __user *udata; + void *data; + int rc; + + rc = -ENOMEM; + data = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + diag304 = kzalloc(sizeof(*diag304), GFP_KERNEL); + if (!data || !diag304) + goto out; + + rc = -EFAULT; + if (copy_from_user(diag304, user_area, sizeof(*diag304))) + goto out; + rc = -EINVAL; + if ((diag304->args[0] >> 8) != 0 || diag304->args[1] > DIAG304_CMD_MAX) + goto out; + + rc = -EFAULT; + udata = (void __user *)(unsigned long) diag304->data; + if (diag304->args[1] == DIAG304_SET_WEIGHTS || + diag304->args[1] == DIAG304_SET_CAPPING) + if (copy_from_user(data, udata, PAGE_SIZE)) + goto out; + + cmd = *(unsigned long *) &diag304->args[0]; + diag304->rc = hypfs_sprp_diag304(data, cmd); + + if (diag304->args[1] == DIAG304_QUERY_PRP) + if (copy_to_user(udata, data, PAGE_SIZE)) { + rc = -EFAULT; + goto out; + } + + rc = copy_to_user(user_area, diag304, sizeof(*diag304)) ? -EFAULT : 0; +out: + kfree(diag304); + free_page((unsigned long) data); + return rc; +} + +static long hypfs_sprp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (is_compat_task()) + argp = compat_ptr(arg); + else + argp = (void __user *) arg; + switch (cmd) { + case HYPFS_DIAG304: + return __hypfs_sprp_ioctl(argp); + default: /* unknown ioctl number */ + return -ENOTTY; + } + return 0; +} + +static struct hypfs_dbfs_file hypfs_sprp_file = { + .name = "diag_304", + .data_create = hypfs_sprp_create, + .data_free = hypfs_sprp_free, + .unlocked_ioctl = hypfs_sprp_ioctl, +}; + +void hypfs_sprp_init(void) +{ + if (!sclp.has_sprp) + return; + hypfs_dbfs_create_file(&hypfs_sprp_file); +} + +void hypfs_sprp_exit(void) +{ + if (!sclp.has_sprp) + return; + hypfs_dbfs_remove_file(&hypfs_sprp_file); +} diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c new file mode 100644 index 0000000000..3db40ad853 --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_vm.h" +#include "hypfs.h" + +#define DBFS_D2FC_HDR_VERSION 0 + +static char local_guest[] = " "; +static char all_guests[] = "* "; +static char *all_groups = all_guests; +char *diag2fc_guest_query; + +static int diag2fc(int size, char* query, void *addr) +{ + unsigned long residual_cnt; + unsigned long rc; + struct diag2fc_parm_list parm_list; + + memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN); + memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN); + parm_list.addr = (unsigned long)addr; + parm_list.size = size; + parm_list.fmt = 0x02; + rc = -1; + + diag_stat_inc(DIAG_STAT_X2FC); + asm volatile( + " diag %0,%1,0x2fc\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : "=d" (residual_cnt), "+d" (rc) : "0" (&parm_list) : "memory"); + + if ((rc != 0 ) && (rc != -2)) + return rc; + else + return -residual_cnt; +} + +/* + * Allocate buffer for "query" and store diag 2fc at "offset" + */ +void *diag2fc_store(char *query, unsigned int *count, int offset) +{ + void *data; + int size; + + do { + size = diag2fc(0, query, NULL); + if (size < 0) + return ERR_PTR(-EACCES); + data = vmalloc(size + offset); + if (!data) + return ERR_PTR(-ENOMEM); + if (diag2fc(size, query, data + offset) == 0) + break; + vfree(data); + } while (1); + *count = (size / sizeof(struct diag2fc_data)); + + return data; +} + +void diag2fc_free(const void *data) +{ + vfree(data); +} + +struct dbfs_d2fc_hdr { + u64 len; /* Length of d2fc buffer without header */ + u16 version; /* Version of header */ + union tod_clock tod_ext; /* TOD clock for d2fc */ + u64 count; /* Number of VM guests in d2fc buffer */ + char reserved[30]; +} __attribute__ ((packed)); + +struct dbfs_d2fc { + struct dbfs_d2fc_hdr hdr; /* 64 byte header */ + char buf[]; /* d2fc buffer */ +} __attribute__ ((packed)); + +static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d2fc *d2fc; + unsigned int count; + + d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr)); + if (IS_ERR(d2fc)) + return PTR_ERR(d2fc); + store_tod_clock_ext(&d2fc->hdr.tod_ext); + d2fc->hdr.len = count * sizeof(struct diag2fc_data); + d2fc->hdr.version = DBFS_D2FC_HDR_VERSION; + d2fc->hdr.count = count; + memset(&d2fc->hdr.reserved, 0, sizeof(d2fc->hdr.reserved)); + *data = d2fc; + *data_free_ptr = d2fc; + *size = d2fc->hdr.len + sizeof(struct dbfs_d2fc_hdr); + return 0; +} + +static struct hypfs_dbfs_file dbfs_file_2fc = { + .name = "diag_2fc", + .data_create = dbfs_diag2fc_create, + .data_free = diag2fc_free, +}; + +int hypfs_vm_init(void) +{ + if (!MACHINE_IS_VM) + return 0; + if (diag2fc(0, all_guests, NULL) > 0) + diag2fc_guest_query = all_guests; + else if (diag2fc(0, local_guest, NULL) > 0) + diag2fc_guest_query = local_guest; + else + return -EACCES; + hypfs_dbfs_create_file(&dbfs_file_2fc); + return 0; +} + +void hypfs_vm_exit(void) +{ + if (!MACHINE_IS_VM) + return; + hypfs_dbfs_remove_file(&dbfs_file_2fc); +} diff --git a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h new file mode 100644 index 0000000000..fe2e5851ad --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu + */ + +#ifndef _S390_HYPFS_VM_H_ +#define _S390_HYPFS_VM_H_ + +#define DIAG2FC_NAME_LEN 8 + +struct diag2fc_data { + __u32 version; + __u32 flags; + __u64 used_cpu; + __u64 el_time; + __u64 mem_min_kb; + __u64 mem_max_kb; + __u64 mem_share_kb; + __u64 mem_used_kb; + __u32 pcpus; + __u32 lcpus; + __u32 vcpus; + __u32 ocpus; + __u32 cpu_max; + __u32 cpu_shares; + __u32 cpu_use_samp; + __u32 cpu_delay_samp; + __u32 page_wait_samp; + __u32 idle_samp; + __u32 other_samp; + __u32 total_samp; + char guest_name[DIAG2FC_NAME_LEN]; +}; + +struct diag2fc_parm_list { + char userid[DIAG2FC_NAME_LEN]; + char aci_grp[DIAG2FC_NAME_LEN]; + __u64 addr; + __u32 size; + __u32 fmt; +}; + +void *diag2fc_store(char *query, unsigned int *count, int offset); +void diag2fc_free(const void *data); +extern char *diag2fc_guest_query; + +#endif /* _S390_HYPFS_VM_H_ */ diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c new file mode 100644 index 0000000000..6011289afa --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 2006 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_vm.h" +#include "hypfs.h" + +#define ATTRIBUTE(dir, name, member) \ +do { \ + void *rc; \ + rc = hypfs_create_u64(dir, name, member); \ + if (IS_ERR(rc)) \ + return PTR_ERR(rc); \ +} while (0) + +static int hypfs_vm_create_guest(struct dentry *systems_dir, + struct diag2fc_data *data) +{ + char guest_name[DIAG2FC_NAME_LEN + 1] = {}; + struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; + int dedicated_flag, capped_value; + + capped_value = (data->flags & 0x00000006) >> 1; + dedicated_flag = (data->flags & 0x00000008) >> 3; + + /* guest dir */ + memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN); + EBCASC(guest_name, DIAG2FC_NAME_LEN); + strim(guest_name); + guest_dir = hypfs_mkdir(systems_dir, guest_name); + if (IS_ERR(guest_dir)) + return PTR_ERR(guest_dir); + ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); + + /* logical cpu information */ + cpus_dir = hypfs_mkdir(guest_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return PTR_ERR(cpus_dir); + ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); + ATTRIBUTE(cpus_dir, "capped", capped_value); + ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); + ATTRIBUTE(cpus_dir, "count", data->vcpus); + /* + * Note: The "weight_min" attribute got the wrong name. + * The value represents the number of non-stopped (operating) + * CPUS. + */ + ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); + ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); + ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); + + /* memory information */ + mem_dir = hypfs_mkdir(guest_dir, "mem"); + if (IS_ERR(mem_dir)) + return PTR_ERR(mem_dir); + ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); + ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); + ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); + ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); + + /* samples */ + samples_dir = hypfs_mkdir(guest_dir, "samples"); + if (IS_ERR(samples_dir)) + return PTR_ERR(samples_dir); + ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); + ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); + ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); + ATTRIBUTE(samples_dir, "idle", data->idle_samp); + ATTRIBUTE(samples_dir, "other", data->other_samp); + ATTRIBUTE(samples_dir, "total", data->total_samp); + return 0; +} + +int hypfs_vm_create_files(struct dentry *root) +{ + struct dentry *dir, *file; + struct diag2fc_data *data; + unsigned int count = 0; + int rc, i; + + data = diag2fc_store(diag2fc_guest_query, &count, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + /* Hypervisor Info */ + dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* physical cpus */ + dir = hypfs_mkdir(root, "cpus"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_u64(dir, "count", data->lcpus); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* guests */ + dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + + for (i = 0; i < count; i++) { + rc = hypfs_vm_create_guest(dir, &data[i]); + if (rc) + goto failed; + } + diag2fc_free(data); + return 0; + +failed: + diag2fc_free(data); + return rc; +} diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c new file mode 100644 index 0000000000..ada8314993 --- /dev/null +++ b/arch/s390/hypfs/inode.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-1.0+ +/* + * Hypervisor filesystem for Linux on s390. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs.h" + +#define HYPFS_MAGIC 0x687970 /* ASCII 'hyp' */ +#define TMP_SIZE 64 /* size of temporary buffers */ + +static struct dentry *hypfs_create_update_file(struct dentry *dir); + +struct hypfs_sb_info { + kuid_t uid; /* uid used for files and dirs */ + kgid_t gid; /* gid used for files and dirs */ + struct dentry *update_file; /* file to trigger update */ + time64_t last_update; /* last update, CLOCK_MONOTONIC time */ + struct mutex lock; /* lock to protect update process */ +}; + +static const struct file_operations hypfs_file_ops; +static struct file_system_type hypfs_type; +static const struct super_operations hypfs_s_ops; + +/* start of list of all dentries, which have to be deleted on update */ +static struct dentry *hypfs_last_dentry; + +static void hypfs_update_update(struct super_block *sb) +{ + struct hypfs_sb_info *sb_info = sb->s_fs_info; + struct inode *inode = d_inode(sb_info->update_file); + + sb_info->last_update = ktime_get_seconds(); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); +} + +/* directory tree removal functions */ + +static void hypfs_add_dentry(struct dentry *dentry) +{ + dentry->d_fsdata = hypfs_last_dentry; + hypfs_last_dentry = dentry; +} + +static void hypfs_remove(struct dentry *dentry) +{ + struct dentry *parent; + + parent = dentry->d_parent; + inode_lock(d_inode(parent)); + if (simple_positive(dentry)) { + if (d_is_dir(dentry)) + simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + } + d_drop(dentry); + dput(dentry); + inode_unlock(d_inode(parent)); +} + +static void hypfs_delete_tree(struct dentry *root) +{ + while (hypfs_last_dentry) { + struct dentry *next_dentry; + next_dentry = hypfs_last_dentry->d_fsdata; + hypfs_remove(hypfs_last_dentry); + hypfs_last_dentry = next_dentry; + } +} + +static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode) +{ + struct inode *ret = new_inode(sb); + + if (ret) { + struct hypfs_sb_info *hypfs_info = sb->s_fs_info; + ret->i_ino = get_next_ino(); + ret->i_mode = mode; + ret->i_uid = hypfs_info->uid; + ret->i_gid = hypfs_info->gid; + ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret); + if (S_ISDIR(mode)) + set_nlink(ret, 2); + } + return ret; +} + +static void hypfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); + kfree(inode->i_private); +} + +static int hypfs_open(struct inode *inode, struct file *filp) +{ + char *data = file_inode(filp)->i_private; + struct hypfs_sb_info *fs_info; + + if (filp->f_mode & FMODE_WRITE) { + if (!(inode->i_mode & S_IWUGO)) + return -EACCES; + } + if (filp->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO)) + return -EACCES; + } + + fs_info = inode->i_sb->s_fs_info; + if(data) { + mutex_lock(&fs_info->lock); + filp->private_data = kstrdup(data, GFP_KERNEL); + if (!filp->private_data) { + mutex_unlock(&fs_info->lock); + return -ENOMEM; + } + mutex_unlock(&fs_info->lock); + } + return nonseekable_open(inode, filp); +} + +static ssize_t hypfs_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + char *data = file->private_data; + size_t available = strlen(data); + loff_t pos = iocb->ki_pos; + size_t count; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !iov_iter_count(to)) + return 0; + count = copy_to_iter(data + pos, available - pos, to); + if (!count) + return -EFAULT; + iocb->ki_pos = pos + count; + file_accessed(file); + return count; +} + +static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + int rc; + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + struct hypfs_sb_info *fs_info = sb->s_fs_info; + size_t count = iov_iter_count(from); + + /* + * Currently we only allow one update per second for two reasons: + * 1. diag 204 is VERY expensive + * 2. If several processes do updates in parallel and then read the + * hypfs data, the likelihood of collisions is reduced, if we restrict + * the minimum update interval. A collision occurs, if during the + * data gathering of one process another process triggers an update + * If the first process wants to ensure consistent data, it has + * to restart data collection in this case. + */ + mutex_lock(&fs_info->lock); + if (fs_info->last_update == ktime_get_seconds()) { + rc = -EBUSY; + goto out; + } + hypfs_delete_tree(sb->s_root); + if (MACHINE_IS_VM) + rc = hypfs_vm_create_files(sb->s_root); + else + rc = hypfs_diag_create_files(sb->s_root); + if (rc) { + pr_err("Updating the hypfs tree failed\n"); + hypfs_delete_tree(sb->s_root); + goto out; + } + hypfs_update_update(sb); + rc = count; + iov_iter_advance(from, count); +out: + mutex_unlock(&fs_info->lock); + return rc; +} + +static int hypfs_release(struct inode *inode, struct file *filp) +{ + kfree(filp->private_data); + return 0; +} + +enum { Opt_uid, Opt_gid, }; + +static const struct fs_parameter_spec hypfs_fs_parameters[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("uid", Opt_uid), + {} +}; + +static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct hypfs_sb_info *hypfs_info = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; + + opt = fs_parse(fc, hypfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + hypfs_info->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + hypfs_info->gid = gid; + break; + } + return 0; +} + +static int hypfs_show_options(struct seq_file *s, struct dentry *root) +{ + struct hypfs_sb_info *hypfs_info = root->d_sb->s_fs_info; + + seq_printf(s, ",uid=%u", from_kuid_munged(&init_user_ns, hypfs_info->uid)); + seq_printf(s, ",gid=%u", from_kgid_munged(&init_user_ns, hypfs_info->gid)); + return 0; +} + +static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct hypfs_sb_info *sbi = sb->s_fs_info; + struct inode *root_inode; + struct dentry *root_dentry, *update_file; + int rc; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = HYPFS_MAGIC; + sb->s_op = &hypfs_s_ops; + + root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); + if (!root_inode) + return -ENOMEM; + root_inode->i_op = &simple_dir_inode_operations; + root_inode->i_fop = &simple_dir_operations; + sb->s_root = root_dentry = d_make_root(root_inode); + if (!root_dentry) + return -ENOMEM; + if (MACHINE_IS_VM) + rc = hypfs_vm_create_files(root_dentry); + else + rc = hypfs_diag_create_files(root_dentry); + if (rc) + return rc; + update_file = hypfs_create_update_file(root_dentry); + if (IS_ERR(update_file)) + return PTR_ERR(update_file); + sbi->update_file = update_file; + hypfs_update_update(sb); + pr_info("Hypervisor filesystem mounted\n"); + return 0; +} + +static int hypfs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, hypfs_fill_super); +} + +static void hypfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hypfs_context_ops = { + .free = hypfs_free_fc, + .parse_param = hypfs_parse_param, + .get_tree = hypfs_get_tree, +}; + +static int hypfs_init_fs_context(struct fs_context *fc) +{ + struct hypfs_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + mutex_init(&sbi->lock); + sbi->uid = current_uid(); + sbi->gid = current_gid(); + + fc->s_fs_info = sbi; + fc->ops = &hypfs_context_ops; + return 0; +} + +static void hypfs_kill_super(struct super_block *sb) +{ + struct hypfs_sb_info *sb_info = sb->s_fs_info; + + if (sb->s_root) + hypfs_delete_tree(sb->s_root); + if (sb_info && sb_info->update_file) + hypfs_remove(sb_info->update_file); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kill_litter_super(sb); +} + +static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, + char *data, umode_t mode) +{ + struct dentry *dentry; + struct inode *inode; + + inode_lock(d_inode(parent)); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) { + dentry = ERR_PTR(-ENOMEM); + goto fail; + } + inode = hypfs_make_inode(parent->d_sb, mode); + if (!inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto fail; + } + if (S_ISREG(mode)) { + inode->i_fop = &hypfs_file_ops; + if (data) + inode->i_size = strlen(data); + else + inode->i_size = 0; + } else if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(d_inode(parent)); + } else + BUG(); + inode->i_private = data; + d_instantiate(dentry, inode); + dget(dentry); +fail: + inode_unlock(d_inode(parent)); + return dentry; +} + +struct dentry *hypfs_mkdir(struct dentry *parent, const char *name) +{ + struct dentry *dentry; + + dentry = hypfs_create_file(parent, name, NULL, S_IFDIR | DIR_MODE); + if (IS_ERR(dentry)) + return dentry; + hypfs_add_dentry(dentry); + return dentry; +} + +static struct dentry *hypfs_create_update_file(struct dentry *dir) +{ + struct dentry *dentry; + + dentry = hypfs_create_file(dir, "update", NULL, + S_IFREG | UPDATE_FILE_MODE); + /* + * We do not put the update file on the 'delete' list with + * hypfs_add_dentry(), since it should not be removed when the tree + * is updated. + */ + return dentry; +} + +struct dentry *hypfs_create_u64(struct dentry *dir, + const char *name, __u64 value) +{ + char *buffer; + char tmp[TMP_SIZE]; + struct dentry *dentry; + + snprintf(tmp, TMP_SIZE, "%llu\n", (unsigned long long int)value); + buffer = kstrdup(tmp, GFP_KERNEL); + if (!buffer) + return ERR_PTR(-ENOMEM); + dentry = + hypfs_create_file(dir, name, buffer, S_IFREG | REG_FILE_MODE); + if (IS_ERR(dentry)) { + kfree(buffer); + return ERR_PTR(-ENOMEM); + } + hypfs_add_dentry(dentry); + return dentry; +} + +struct dentry *hypfs_create_str(struct dentry *dir, + const char *name, char *string) +{ + char *buffer; + struct dentry *dentry; + + buffer = kmalloc(strlen(string) + 2, GFP_KERNEL); + if (!buffer) + return ERR_PTR(-ENOMEM); + sprintf(buffer, "%s\n", string); + dentry = + hypfs_create_file(dir, name, buffer, S_IFREG | REG_FILE_MODE); + if (IS_ERR(dentry)) { + kfree(buffer); + return ERR_PTR(-ENOMEM); + } + hypfs_add_dentry(dentry); + return dentry; +} + +static const struct file_operations hypfs_file_ops = { + .open = hypfs_open, + .release = hypfs_release, + .read_iter = hypfs_read_iter, + .write_iter = hypfs_write_iter, + .llseek = no_llseek, +}; + +static struct file_system_type hypfs_type = { + .owner = THIS_MODULE, + .name = "s390_hypfs", + .init_fs_context = hypfs_init_fs_context, + .parameters = hypfs_fs_parameters, + .kill_sb = hypfs_kill_super +}; + +static const struct super_operations hypfs_s_ops = { + .statfs = simple_statfs, + .evict_inode = hypfs_evict_inode, + .show_options = hypfs_show_options, +}; + +int __init __hypfs_fs_init(void) +{ + int rc; + + rc = sysfs_create_mount_point(hypervisor_kobj, "s390"); + if (rc) + return rc; + rc = register_filesystem(&hypfs_type); + if (rc) + goto fail; + return 0; +fail: + sysfs_remove_mount_point(hypervisor_kobj, "s390"); + return rc; +} diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild new file mode 100644 index 0000000000..4b904110d2 --- /dev/null +++ b/arch/s390/include/asm/Kbuild @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +generated-y += dis-defs.h +generated-y += facility-defs.h +generated-y += syscall_table.h +generated-y += unistd_nr.h + +generic-y += asm-offsets.h +generic-y += kvm_types.h +generic-y += mcs_spinlock.h diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h new file mode 100644 index 0000000000..6f264b79e3 --- /dev/null +++ b/arch/s390/include/asm/abs_lowcore.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ABS_LOWCORE_H +#define _ASM_S390_ABS_LOWCORE_H + +#include + +#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) + +extern unsigned long __abs_lowcore; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); +void abs_lowcore_unmap(int cpu); + +static inline struct lowcore *get_abs_lowcore(void) +{ + int cpu; + + cpu = get_cpu(); + return ((struct lowcore *)__abs_lowcore) + cpu; +} + +static inline void put_abs_lowcore(struct lowcore *lc) +{ + put_cpu(); +} + +#endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h new file mode 100644 index 0000000000..c4c28c2609 --- /dev/null +++ b/arch/s390/include/asm/airq.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2002, 2007 + * Author(s): Ingo Adlung + * Cornelia Huck + * Arnd Bergmann + * Peter Oberparleiter + */ + +#ifndef _ASM_S390_AIRQ_H +#define _ASM_S390_AIRQ_H + +#include +#include +#include + +struct airq_struct { + struct hlist_node list; /* Handler queueing. */ + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); + u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ + u8 isc; /* Interrupt-subclass */ + u8 flags; +}; + +#define AIRQ_PTR_ALLOCATED 0x01 + +int register_adapter_interrupt(struct airq_struct *airq); +void unregister_adapter_interrupt(struct airq_struct *airq); + +/* Adapter interrupt bit vector */ +struct airq_iv { + unsigned long *vector; /* Adapter interrupt bit vector */ + dma_addr_t vector_dma; /* Adapter interrupt bit vector dma */ + unsigned long *avail; /* Allocation bit mask for the bit vector */ + unsigned long *bitlock; /* Lock bit mask for the bit vector */ + unsigned long *ptr; /* Pointer associated with each bit */ + unsigned int *data; /* 32 bit value associated with each bit */ + unsigned long bits; /* Number of bits in the vector */ + unsigned long end; /* Number of highest allocated bit + 1 */ + unsigned long flags; /* Allocation flags */ + spinlock_t lock; /* Lock to protect alloc & free */ +}; + +#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ +#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ +#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ +#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ + +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); +void airq_iv_release(struct airq_iv *iv); +unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); +void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); +unsigned long airq_iv_scan(struct airq_iv *iv, unsigned long start, + unsigned long end); + +static inline unsigned long airq_iv_alloc_bit(struct airq_iv *iv) +{ + return airq_iv_alloc(iv, 1); +} + +static inline void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit) +{ + airq_iv_free(iv, bit, 1); +} + +static inline unsigned long airq_iv_end(struct airq_iv *iv) +{ + return iv->end; +} + +static inline void airq_iv_lock(struct airq_iv *iv, unsigned long bit) +{ + const unsigned long be_to_le = BITS_PER_LONG - 1; + bit_spin_lock(bit ^ be_to_le, iv->bitlock); +} + +static inline void airq_iv_unlock(struct airq_iv *iv, unsigned long bit) +{ + const unsigned long be_to_le = BITS_PER_LONG - 1; + bit_spin_unlock(bit ^ be_to_le, iv->bitlock); +} + +static inline void airq_iv_set_data(struct airq_iv *iv, unsigned long bit, + unsigned int data) +{ + iv->data[bit] = data; +} + +static inline unsigned int airq_iv_get_data(struct airq_iv *iv, + unsigned long bit) +{ + return iv->data[bit]; +} + +static inline void airq_iv_set_ptr(struct airq_iv *iv, unsigned long bit, + unsigned long ptr) +{ + iv->ptr[bit] = ptr; +} + +static inline unsigned long airq_iv_get_ptr(struct airq_iv *iv, + unsigned long bit) +{ + return iv->ptr[bit]; +} + +#endif /* _ASM_S390_AIRQ_H */ diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h new file mode 100644 index 0000000000..7db046596b --- /dev/null +++ b/arch/s390/include/asm/alternative-asm.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ALTERNATIVE_ASM_H +#define _ASM_S390_ALTERNATIVE_ASM_H + +#ifdef __ASSEMBLY__ + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature + .long \orig_start - . + .long \alt_start - . + .word \feature + .byte \orig_end - \orig_start + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature + .pushsection .altinstr_replacement,"ax" +770: \newinstr +771: .popsection +772: \oldinstr +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature + .popsection +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 + .pushsection .altinstr_replacement,"ax" +770: \newinstr1 +771: \newinstr2 +772: .popsection +773: \oldinstr +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_ALTERNATIVE_ASM_H */ diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h new file mode 100644 index 0000000000..904dd049f9 --- /dev/null +++ b/arch/s390/include/asm/alternative.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ALTERNATIVE_H +#define _ASM_S390_ALTERNATIVE_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 facility; /* facility bit set for replacement */ + u8 instrlen; /* length of original instruction */ +} __packed; + +void apply_alternative_instructions(void); +void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + +/* + * +---------------------------------+ + * |661: |662: + * | oldinstr | + * +---------------------------------+ + * + * .altinstr_replacement section + * +---------------------------------+ + * |6641: |6651: + * | alternative instr 1 | + * +---------------------------------+ + * |6642: |6652: + * | alternative instr 2 | + * +---------------------------------+ + * + * .altinstructions section + * +---------------------------------+ + * | alt_instr entries for each | + * | alternative instr | + * +---------------------------------+ + */ + +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num +#define oldinstr_len "662b-661b" +#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" + +#define OLDINSTR(oldinstr) \ + "661:\n\t" oldinstr "\n662:\n" + +#define ALTINSTR_ENTRY(facility, num) \ + "\t.long 661b - .\n" /* old instruction */ \ + "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ + "\t.word " __stringify(facility) "\n" /* facility bit */ \ + "\t.byte " oldinstr_len "\n" /* instruction len */ \ + "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ + "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n" + +#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" + +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, altinstr, facility) \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr, 1) \ + ".popsection\n" \ + OLDINSTR(oldinstr) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility, 1) \ + ".popsection\n" + +#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr1, 1) \ + ALTINSTR_REPLACEMENT(altinstr2, 2) \ + ".popsection\n" \ + OLDINSTR(oldinstr) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility1, 1) \ + ALTINSTR_ENTRY(facility2, 2) \ + ".popsection\n" + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * oldinstr is padded with jump and nops at compile time if altinstr is + * longer. altinstr is padded with jump and nops at run-time during patching. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, altinstr, facility) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory") + +#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ + altinstr2, facility2) ::: "memory") + +/* Alternative inline assembly with input. */ +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, altinstr, facility, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \ + : output : input) + +/* Use this macro if more than one output parameter is needed. */ +#define ASM_OUTPUT2(a...) a + +/* Use this macro if clobbers are needed without inputs. */ +#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h new file mode 100644 index 0000000000..40c2b82f08 --- /dev/null +++ b/arch/s390/include/asm/ap.h @@ -0,0 +1,555 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Adjunct processor (AP) interfaces + * + * Copyright IBM Corp. 2017 + * + * Author(s): Tony Krowiak + * Martin Schwidefsky + * Harald Freudenberger + */ + +#ifndef _ASM_S390_AP_H_ +#define _ASM_S390_AP_H_ + +#include +#include + +/** + * The ap_qid_t identifier of an ap queue. + * If the AP facilities test (APFT) facility is available, + * card and queue index are 8 bit values, otherwise + * card index is 6 bit and queue index a 4 bit value. + */ +typedef unsigned int ap_qid_t; + +#define AP_MKQID(_card, _queue) (((_card) & 0xff) << 8 | ((_queue) & 0xff)) +#define AP_QID_CARD(_qid) (((_qid) >> 8) & 0xff) +#define AP_QID_QUEUE(_qid) ((_qid) & 0xff) + +/** + * struct ap_queue_status - Holds the AP queue status. + * @queue_empty: Shows if queue is empty + * @replies_waiting: Waiting replies + * @queue_full: Is 1 if the queue is full + * @irq_enabled: Shows if interrupts are enabled for the AP + * @response_code: Holds the 8 bit response code + * + * The ap queue status word is returned by all three AP functions + * (PQAP, NQAP and DQAP). There's a set of flags in the first + * byte, followed by a 1 byte response code. + */ +struct ap_queue_status { + unsigned int queue_empty : 1; + unsigned int replies_waiting : 1; + unsigned int queue_full : 1; + unsigned int : 3; + unsigned int async : 1; + unsigned int irq_enabled : 1; + unsigned int response_code : 8; + unsigned int : 16; +}; + +/* + * AP queue status reg union to access the reg1 + * register with the lower 32 bits comprising the + * ap queue status. + */ +union ap_queue_status_reg { + unsigned long value; + struct { + u32 _pad; + struct ap_queue_status status; + }; +}; + +/** + * ap_intructions_available() - Test if AP instructions are available. + * + * Returns true if the AP instructions are installed, otherwise false. + */ +static inline bool ap_instructions_available(void) +{ + unsigned long reg0 = AP_MKQID(0, 0); + unsigned long reg1 = 0; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid into gr0 */ + " lghi 1,0\n" /* 0 into gr1 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + "0: la %[reg1],1\n" /* 1 into reg1 */ + "1:\n" + EX_TABLE(0b, 1b) + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); + return reg1 != 0; +} + +/* TAPQ register GR2 response struct */ +struct ap_tapq_gr2 { + union { + unsigned long value; + struct { + unsigned int fac : 32; /* facility bits */ + unsigned int apinfo : 32; /* ap type, ... */ + }; + struct { + unsigned int s : 1; /* APSC */ + unsigned int m : 1; /* AP4KM */ + unsigned int c : 1; /* AP4KC */ + unsigned int mode : 3; + unsigned int n : 1; /* APXA */ + unsigned int : 1; + unsigned int class : 8; + unsigned int bs : 2; /* SE bind/assoc */ + unsigned int : 14; + unsigned int at : 8; /* ap type */ + unsigned int nd : 8; /* nr of domains */ + unsigned int : 4; + unsigned int ml : 4; /* apxl ml */ + unsigned int : 4; + unsigned int qd : 4; /* queue depth */ + }; + }; +}; + +/* + * Convenience defines to be used with the bs field from struct ap_tapq_gr2 + */ +#define AP_BS_Q_USABLE 0 +#define AP_BS_Q_USABLE_NO_SECURE_KEY 1 +#define AP_BS_Q_AVAIL_FOR_BINDING 2 +#define AP_BS_Q_UNUSABLE 3 + +/** + * ap_tapq(): Test adjunct processor queue. + * @qid: The AP queue number + * @info: Pointer to queue descriptor + * + * Returns AP queue status structure. + */ +static inline struct ap_queue_status ap_tapq(ap_qid_t qid, struct ap_tapq_gr2 *info) +{ + union ap_queue_status_reg reg1; + unsigned long reg2; + + asm volatile( + " lgr 0,%[qid]\n" /* qid into gr0 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 into reg2 */ + : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2) + : [qid] "d" (qid) + : "cc", "0", "1", "2"); + if (info) + info->value = reg2; + return reg1.status; +} + +/** + * ap_test_queue(): Test adjunct processor queue. + * @qid: The AP queue number + * @tbit: Test facilities bit + * @info: Ptr to tapq gr2 struct + * + * Returns AP queue status structure. + */ +static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit, + struct ap_tapq_gr2 *info) +{ + if (tbit) + qid |= 1UL << 23; /* set T bit*/ + return ap_tapq(qid, info); +} + +/** + * ap_pqap_rapq(): Reset adjunct processor queue. + * @qid: The AP queue number + * @fbit: if != 0 set F bit + * + * Returns AP queue status structure. + */ +static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit) +{ + unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; +} + +/** + * ap_pqap_zapq(): Reset and zeroize adjunct processor queue. + * @qid: The AP queue number + * @fbit: if != 0 set F bit + * + * Returns AP queue status structure. + */ +static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) +{ + unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */ + union ap_queue_status_reg reg1; + + if (fbit) + reg0 |= 1UL << 22; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + return reg1.status; +} + +/** + * struct ap_config_info - convenience struct for AP crypto + * config info as returned by the ap_qci() function. + */ +struct ap_config_info { + unsigned int apsc : 1; /* S bit */ + unsigned int apxa : 1; /* N bit */ + unsigned int qact : 1; /* C bit */ + unsigned int rc8a : 1; /* R bit */ + unsigned int : 4; + unsigned int apsb : 1; /* B bit */ + unsigned int : 23; + unsigned char na; /* max # of APs - 1 */ + unsigned char nd; /* max # of Domains - 1 */ + unsigned char _reserved0[10]; + unsigned int apm[8]; /* AP ID mask */ + unsigned int aqm[8]; /* AP (usage) queue mask */ + unsigned int adm[8]; /* AP (control) domain mask */ + unsigned char _reserved1[16]; +} __aligned(8); + +/** + * ap_qci(): Get AP configuration data + * + * Returns 0 on success, or -EOPNOTSUPP. + */ +static inline int ap_qci(struct ap_config_info *config) +{ + unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */ + unsigned long reg1 = -EOPNOTSUPP; + struct ap_config_info *reg2 = config; + + asm volatile( + " lgr 0,%[reg0]\n" /* QCI fc into gr0 */ + " lgr 2,%[reg2]\n" /* ptr to config into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QCI) */ + "0: la %[reg1],0\n" /* good case, QCI fc available */ + "1:\n" + EX_TABLE(0b, 1b) + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "2"); + + return reg1; +} + +/* + * struct ap_qirq_ctrl - convenient struct for easy invocation + * of the ap_aqic() function. This struct is passed as GR1 + * parameter to the PQAP(AQIC) instruction. For details please + * see the AR documentation. + */ +union ap_qirq_ctrl { + unsigned long value; + struct { + unsigned int : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int : 1; + unsigned int isc : 3; /* irq sub class */ + }; +}; + +/** + * ap_aqic(): Control interruption for a specific AP. + * @qid: The AP queue number + * @qirqctrl: struct ap_qirq_ctrl (64 bit value) + * @pa_ind: Physical address of the notification indicator byte + * + * Returns AP queue status. + */ +static inline struct ap_queue_status ap_aqic(ap_qid_t qid, + union ap_qirq_ctrl qirqctrl, + phys_addr_t pa_ind) +{ + unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */ + union ap_queue_status_reg reg1; + unsigned long reg2 = pa_ind; + + reg1.value = qirqctrl.value; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */ + " lgr 2,%[reg2]\n" /* ni addr into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "+&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "1", "2"); + + return reg1.status; +} + +/* + * union ap_qact_ap_info - used together with the + * ap_aqic() function to provide a convenient way + * to handle the ap info needed by the qact function. + */ +union ap_qact_ap_info { + unsigned long val; + struct { + unsigned int : 3; + unsigned int mode : 3; + unsigned int : 26; + unsigned int cat : 8; + unsigned int : 8; + unsigned char ver[2]; + }; +}; + +/** + * ap_qact(): Query AP compatibility type. + * @qid: The AP queue number + * @apinfo: On input the info about the AP queue. On output the + * alternate AP queue info provided by the qact function + * in GR2 is stored in. + * + * Returns AP queue status. Check response_code field for failures. + */ +static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, + union ap_qact_ap_info *apinfo) +{ + unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); + union ap_queue_status_reg reg1; + unsigned long reg2; + + reg1.value = apinfo->val; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* qact in info into gr1 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* qact out info into reg2 */ + : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); + apinfo->val = reg2; + return reg1.status; +} + +/* + * ap_bapq(): SE bind AP queue. + * @qid: The AP queue number + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may case a specification exception. + */ +static inline struct ap_queue_status ap_bapq(ap_qid_t qid) +{ + unsigned long reg0 = qid | (7UL << 24); /* fc 7 is BAPQ */ + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0) + : "cc", "0", "1"); + + return reg1.status; +} + +/* + * ap_aapq(): SE associate AP queue. + * @qid: The AP queue number + * @sec_idx: The secret index + * + * Returns AP queue status structure. + * + * Invoking this function in a non-SE environment + * may case a specification exception. + */ +static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx) +{ + unsigned long reg0 = qid | (8UL << 24); /* fc 8 is AAPQ */ + unsigned long reg2 = sec_idx; + union ap_queue_status_reg reg1; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " lgr 2,%[reg2]\n" /* secret index into gr2 */ + " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1.value) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "0", "1", "2"); + + return reg1.status; +} + +/** + * ap_nqap(): Send message to adjunct processor queue. + * @qid: The AP queue number + * @psmid: The program supplied message identifier + * @msg: The message text + * @length: The message length + * + * Returns AP queue status structure. + * Condition code 1 on NQAP can't happen because the L bit is 1. + * Condition code 2 on NQAP also means the send is incomplete, + * because a segment boundary was reached. The NQAP is repeated. + */ +static inline struct ap_queue_status ap_nqap(ap_qid_t qid, + unsigned long long psmid, + void *msg, size_t length) +{ + unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */ + union register_pair nqap_r1, nqap_r2; + union ap_queue_status_reg reg1; + + nqap_r1.even = (unsigned int)(psmid >> 32); + nqap_r1.odd = psmid & 0xffffffff; + nqap_r2.even = (unsigned long)msg; + nqap_r2.odd = (unsigned long)length; + + asm volatile ( + " lgr 0,%[reg0]\n" /* qid param in gr0 */ + "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" + " brc 2,0b\n" /* handle partial completion */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [nqap_r2] "+&d" (nqap_r2.pair) + : [nqap_r1] "d" (nqap_r1.pair) + : "cc", "memory", "0", "1"); + return reg1.status; +} + +/** + * ap_dqap(): Receive message from adjunct processor queue. + * @qid: The AP queue number + * @psmid: Pointer to program supplied message identifier + * @msg: Pointer to message buffer + * @msglen: Message buffer size + * @length: Pointer to length of actually written bytes + * @reslength: Residual length on return + * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content + * + * Returns AP queue status structure. + * Condition code 1 on DQAP means the receive has taken place + * but only partially. The response is incomplete, hence the + * DQAP is repeated. + * Condition code 2 on DQAP also means the receive is incomplete, + * this time because a segment boundary was reached. Again, the + * DQAP is repeated. + * Note that gpr2 is used by the DQAP instruction to keep track of + * any 'residual' length, in case the instruction gets interrupted. + * Hence it gets zeroed before the instruction. + * If the message does not fit into the buffer, this function will + * return with a truncated message and the reply in the firmware queue + * is not removed. This is indicated to the caller with an + * ap_queue_status response_code value of all bits on (0xFF) and (if + * the reslength ptr is given) the remaining length is stored in + * *reslength and (if the resgr0 ptr is given) the updated gr0 value + * for further processing of this msg entry is stored in *resgr0. The + * caller needs to detect this situation and should invoke ap_dqap + * with a valid resgr0 ptr and a value in there != 0 to indicate that + * *resgr0 is to be used instead of qid to further process this entry. + */ +static inline struct ap_queue_status ap_dqap(ap_qid_t qid, + unsigned long *psmid, + void *msg, size_t msglen, + size_t *length, + size_t *reslength, + unsigned long *resgr0) +{ + unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL; + union ap_queue_status_reg reg1; + unsigned long reg2; + union register_pair rp1, rp2; + + rp1.even = 0UL; + rp1.odd = 0UL; + rp2.even = (unsigned long)msg; + rp2.odd = (unsigned long)msglen; + + asm volatile( + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lghi 2,0\n" /* 0 into gr2 (res length) */ + "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */ + " jz 2f\n" /* go out if buf len is 0 */ + "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n" + " brc 6,0b\n" /* handle partial complete */ + "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value), + [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair), + [rp2] "+&d" (rp2.pair) + : + : "cc", "memory", "0", "1", "2"); + + if (reslength) + *reslength = reg2; + if (reg2 != 0 && rp2.odd == 0) { + /* + * Partially complete, status in gr1 is not set. + * Signal the caller that this dqap is only partially received + * with a special status response code 0xFF and *resgr0 updated + */ + reg1.status.response_code = 0xFF; + if (resgr0) + *resgr0 = reg0; + } else { + *psmid = (rp1.even << 32) + rp1.odd; + if (resgr0) + *resgr0 = 0; + } + + /* update *length with the nr of bytes stored into the msg buffer */ + if (length) + *length = msglen - rp2.odd; + + return reg1.status; +} + +/* + * Interface to tell the AP bus code that a configuration + * change has happened. The bus code should at least do + * an ap bus resource rescan. + */ +#if IS_ENABLED(CONFIG_ZCRYPT) +void ap_bus_cfg_chg(void); +#else +static inline void ap_bus_cfg_chg(void){} +#endif + +#endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h new file mode 100644 index 0000000000..f2240392c7 --- /dev/null +++ b/arch/s390/include/asm/appldata.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2006 + * + * Author(s): Melissa Howland + */ + +#ifndef _ASM_S390_APPLDATA_H +#define _ASM_S390_APPLDATA_H + +#include +#include + +#define APPLDATA_START_INTERVAL_REC 0x80 +#define APPLDATA_STOP_REC 0x81 +#define APPLDATA_GEN_EVENT_REC 0x82 +#define APPLDATA_START_CONFIG_REC 0x83 + +/* + * Parameter list for DIAGNOSE X'DC' + */ +struct appldata_parameter_list { + u16 diag; + u8 function; + u8 parlist_length; + u32 unused01; + u16 reserved; + u16 buffer_length; + u32 unused02; + u64 product_id_addr; + u64 buffer_addr; +} __attribute__ ((packed)); + +struct appldata_product_id { + char prod_nr[7]; /* product number */ + u16 prod_fn; /* product function */ + u8 record_nr; /* record number */ + u16 version_nr; /* version */ + u16 release_nr; /* release */ + u16 mod_lvl; /* modification level */ +} __attribute__ ((packed)); + + +static inline int appldata_asm(struct appldata_parameter_list *parm_list, + struct appldata_product_id *id, + unsigned short fn, void *buffer, + unsigned short length) +{ + int ry; + + if (!MACHINE_IS_VM) + return -EOPNOTSUPP; + parm_list->diag = 0xdc; + parm_list->function = fn; + parm_list->parlist_length = sizeof(*parm_list); + parm_list->buffer_length = length; + parm_list->product_id_addr = (unsigned long) id; + parm_list->buffer_addr = virt_to_phys(buffer); + diag_stat_inc(DIAG_STAT_X0DC); + asm volatile( + " diag %1,%0,0xdc" + : "=d" (ry) + : "d" (parm_list), "m" (*parm_list), "m" (*id) + : "cc"); + return ry; +} + +#endif /* _ASM_S390_APPLDATA_H */ diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h new file mode 100644 index 0000000000..1594049893 --- /dev/null +++ b/arch/s390/include/asm/archrandom.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel interface for the s390 arch_random_* functions + * + * Copyright IBM Corp. 2017, 2022 + * + * Author: Harald Freudenberger + * + */ + +#ifndef _ASM_S390_ARCHRANDOM_H +#define _ASM_S390_ARCHRANDOM_H + +#include +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); +extern atomic64_t s390_arch_random_counter; + +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) +{ + return 0; +} + +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) +{ + if (static_branch_likely(&s390_arch_random_available) && + in_task()) { + cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v)); + atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter); + return max_longs; + } + return 0; +} + +#endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h new file mode 100644 index 0000000000..11f615eb00 --- /dev/null +++ b/arch/s390/include/asm/asm-const.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_CONST_H +#define _ASM_S390_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " +#endif +#endif /* _ASM_S390_ASM_CONST_H */ diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h new file mode 100644 index 0000000000..e6532477f1 --- /dev/null +++ b/arch/s390/include/asm/asm-extable.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_EXTABLE_H +#define __ASM_EXTABLE_H + +#include +#include +#include + +#define EX_TYPE_NONE 0 +#define EX_TYPE_FIXUP 1 +#define EX_TYPE_BPF 2 +#define EX_TYPE_UA_STORE 3 +#define EX_TYPE_UA_LOAD_MEM 4 +#define EX_TYPE_UA_LOAD_REG 5 +#define EX_TYPE_UA_LOAD_REGPAIR 6 + +#define EX_DATA_REG_ERR_SHIFT 0 +#define EX_DATA_REG_ERR GENMASK(3, 0) + +#define EX_DATA_REG_ADDR_SHIFT 4 +#define EX_DATA_REG_ADDR GENMASK(7, 4) + +#define EX_DATA_LEN_SHIFT 8 +#define EX_DATA_LEN GENMASK(11, 8) + +#define __EX_TABLE(_section, _fault, _target, _type) \ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.short (_type);) \ + stringify_in_c(.short 0;) \ + stringify_in_c(.previous) + +#define __EX_TABLE_UA(_section, _fault, _target, _type, _regerr, _regaddr, _len)\ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.short (_type);) \ + stringify_in_c(.macro extable_reg regerr, regaddr;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regerr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregerr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument1";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.set .Lfound, 0;) \ + stringify_in_c(.set .Lcurr, 0;) \ + stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \ + stringify_in_c( .ifc "\regaddr", "%%r\rs";) \ + stringify_in_c( .set .Lfound, 1;) \ + stringify_in_c( .set .Lregaddr, .Lcurr;) \ + stringify_in_c( .endif;) \ + stringify_in_c( .set .Lcurr, .Lcurr+1;) \ + stringify_in_c(.endr;) \ + stringify_in_c(.ifne (.Lfound != 1);) \ + stringify_in_c( .error "extable_reg: bad register argument2";) \ + stringify_in_c(.endif;) \ + stringify_in_c(.short .Lregerr << EX_DATA_REG_ERR_SHIFT | \ + .Lregaddr << EX_DATA_REG_ADDR_SHIFT | \ + _len << EX_DATA_LEN_SHIFT;) \ + stringify_in_c(.endm;) \ + stringify_in_c(extable_reg _regerr,_regaddr;) \ + stringify_in_c(.purgem extable_reg;) \ + stringify_in_c(.previous) + +#define EX_TABLE(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP) + +#define EX_TABLE_AMODE31(_fault, _target) \ + __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP) + +#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \ + __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0) + +#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \ + __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len) + +#define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ + __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) + +#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \ + __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0) + +#endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h new file mode 100644 index 0000000000..a873e873e1 --- /dev/null +++ b/arch/s390/include/asm/asm-prototypes.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_PROTOTYPES_H + +#include +#include +#include +#include + +__int128_t __ashlti3(__int128_t a, int b); +__int128_t __ashrti3(__int128_t a, int b); +__int128_t __lshrti3(__int128_t a, int b); + +#endif /* _ASM_S390_PROTOTYPES_H */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h new file mode 100644 index 0000000000..7138d189cc --- /dev/null +++ b/arch/s390/include/asm/atomic.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2016 + * Author(s): Martin Schwidefsky , + * Denis Joseph Barrow, + * Arnd Bergmann, + */ + +#ifndef __ARCH_S390_ATOMIC__ +#define __ARCH_S390_ATOMIC__ + +#include +#include +#include +#include +#include + +static inline int arch_atomic_read(const atomic_t *v) +{ + return __atomic_read(v); +} +#define arch_atomic_read arch_atomic_read + +static inline void arch_atomic_set(atomic_t *v, int i) +{ + __atomic_set(v, i); +} +#define arch_atomic_set arch_atomic_set + +static inline int arch_atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add_barrier(i, &v->counter) + i; +} +#define arch_atomic_add_return arch_atomic_add_return + +static inline int arch_atomic_fetch_add(int i, atomic_t *v) +{ + return __atomic_add_barrier(i, &v->counter); +} +#define arch_atomic_fetch_add arch_atomic_fetch_add + +static inline void arch_atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, &v->counter); +} +#define arch_atomic_add arch_atomic_add + +#define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v) +#define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v) +#define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) + +#define ATOMIC_OPS(op) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ +{ \ + __atomic_##op(i, &v->counter); \ +} \ +static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ +{ \ + return __atomic_##op##_barrier(i, &v->counter); \ +} + +ATOMIC_OPS(and) +ATOMIC_OPS(or) +ATOMIC_OPS(xor) + +#undef ATOMIC_OPS + +#define arch_atomic_and arch_atomic_and +#define arch_atomic_or arch_atomic_or +#define arch_atomic_xor arch_atomic_xor +#define arch_atomic_fetch_and arch_atomic_fetch_and +#define arch_atomic_fetch_or arch_atomic_fetch_or +#define arch_atomic_fetch_xor arch_atomic_fetch_xor + +#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) + +static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return __atomic_cmpxchg(&v->counter, old, new); +} +#define arch_atomic_cmpxchg arch_atomic_cmpxchg + +#define ATOMIC64_INIT(i) { (i) } + +static inline s64 arch_atomic64_read(const atomic64_t *v) +{ + return __atomic64_read(v); +} +#define arch_atomic64_read arch_atomic64_read + +static inline void arch_atomic64_set(atomic64_t *v, s64 i) +{ + __atomic64_set(v, i); +} +#define arch_atomic64_set arch_atomic64_set + +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +{ + return __atomic64_add_barrier(i, (long *)&v->counter) + i; +} +#define arch_atomic64_add_return arch_atomic64_add_return + +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +{ + return __atomic64_add_barrier(i, (long *)&v->counter); +} +#define arch_atomic64_fetch_add arch_atomic64_fetch_add + +static inline void arch_atomic64_add(s64 i, atomic64_t *v) +{ + __atomic64_add(i, (long *)&v->counter); +} +#define arch_atomic64_add arch_atomic64_add + +#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) + +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return __atomic64_cmpxchg((long *)&v->counter, old, new); +} +#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg + +#define ATOMIC64_OPS(op) \ +static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ +{ \ + __atomic64_##op(i, (long *)&v->counter); \ +} \ +static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ +{ \ + return __atomic64_##op##_barrier(i, (long *)&v->counter); \ +} + +ATOMIC64_OPS(and) +ATOMIC64_OPS(or) +ATOMIC64_OPS(xor) + +#undef ATOMIC64_OPS + +#define arch_atomic64_and arch_atomic64_and +#define arch_atomic64_or arch_atomic64_or +#define arch_atomic64_xor arch_atomic64_xor +#define arch_atomic64_fetch_and arch_atomic64_fetch_and +#define arch_atomic64_fetch_or arch_atomic64_fetch_or +#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor + +#define arch_atomic64_sub_return(_i, _v) arch_atomic64_add_return(-(s64)(_i), _v) +#define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v) +#define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v) + +#endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h new file mode 100644 index 0000000000..50510e08b8 --- /dev/null +++ b/arch/s390/include/asm/atomic_ops.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Low level function for atomic operations + * + * Copyright IBM Corp. 1999, 2016 + */ + +#ifndef __ARCH_S390_ATOMIC_OPS__ +#define __ARCH_S390_ATOMIC_OPS__ + +static inline int __atomic_read(const atomic_t *v) +{ + int c; + + asm volatile( + " l %0,%1\n" + : "=d" (c) : "R" (v->counter)); + return c; +} + +static inline void __atomic_set(atomic_t *v, int i) +{ + asm volatile( + " st %1,%0\n" + : "=R" (v->counter) : "d" (i)); +} + +static inline s64 __atomic64_read(const atomic64_t *v) +{ + s64 c; + + asm volatile( + " lg %0,%1\n" + : "=d" (c) : "RT" (v->counter)); + return c; +} + +static inline void __atomic64_set(atomic64_t *v, s64 i) +{ + asm volatile( + " stg %1,%0\n" + : "=RT" (v->counter) : "d" (i)); +} + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ +static inline op_type op_name(op_type val, op_type *ptr) \ +{ \ + op_type old; \ + \ + asm volatile( \ + op_string " %[old],%[val],%[ptr]\n" \ + op_barrier \ + : [old] "=d" (old), [ptr] "+QS" (*ptr) \ + : [val] "d" (val) : "cc", "memory"); \ + return old; \ +} \ + +#define __ATOMIC_OPS(op_name, op_type, op_string) \ + __ATOMIC_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + +__ATOMIC_OPS(__atomic_add, int, "laa") +__ATOMIC_OPS(__atomic_and, int, "lan") +__ATOMIC_OPS(__atomic_or, int, "lao") +__ATOMIC_OPS(__atomic_xor, int, "lax") + +__ATOMIC_OPS(__atomic64_add, long, "laag") +__ATOMIC_OPS(__atomic64_and, long, "lang") +__ATOMIC_OPS(__atomic64_or, long, "laog") +__ATOMIC_OPS(__atomic64_xor, long, "laxg") + +#undef __ATOMIC_OPS +#undef __ATOMIC_OP + +#define __ATOMIC_CONST_OP(op_name, op_type, op_string, op_barrier) \ +static __always_inline void op_name(op_type val, op_type *ptr) \ +{ \ + asm volatile( \ + op_string " %[ptr],%[val]\n" \ + op_barrier \ + : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\ +} + +#define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \ + __ATOMIC_CONST_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_CONST_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + +__ATOMIC_CONST_OPS(__atomic_add_const, int, "asi") +__ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") + +#undef __ATOMIC_CONST_OPS +#undef __ATOMIC_CONST_OP + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define __ATOMIC_OP(op_name, op_string) \ +static inline int op_name(int val, int *ptr) \ +{ \ + int old, new; \ + \ + asm volatile( \ + "0: lr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " cs %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC_OPS(op_name, op_string) \ + __ATOMIC_OP(op_name, op_string) \ + __ATOMIC_OP(op_name##_barrier, op_string) + +__ATOMIC_OPS(__atomic_add, "ar") +__ATOMIC_OPS(__atomic_and, "nr") +__ATOMIC_OPS(__atomic_or, "or") +__ATOMIC_OPS(__atomic_xor, "xr") + +#undef __ATOMIC_OPS + +#define __ATOMIC64_OP(op_name, op_string) \ +static inline long op_name(long val, long *ptr) \ +{ \ + long old, new; \ + \ + asm volatile( \ + "0: lgr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " csg %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+QS" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC64_OPS(op_name, op_string) \ + __ATOMIC64_OP(op_name, op_string) \ + __ATOMIC64_OP(op_name##_barrier, op_string) + +__ATOMIC64_OPS(__atomic64_add, "agr") +__ATOMIC64_OPS(__atomic64_and, "ngr") +__ATOMIC64_OPS(__atomic64_or, "ogr") +__ATOMIC64_OPS(__atomic64_xor, "xgr") + +#undef __ATOMIC64_OPS + +#define __atomic_add_const(val, ptr) __atomic_add(val, ptr) +#define __atomic_add_const_barrier(val, ptr) __atomic_add(val, ptr) +#define __atomic64_add_const(val, ptr) __atomic64_add(val, ptr) +#define __atomic64_add_const_barrier(val, ptr) __atomic64_add(val, ptr) + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +static inline int __atomic_cmpxchg(int *ptr, int old, int new) +{ + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; +} + +static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) +{ + int old_expected = old; + + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; +} + +static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +{ + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old; +} + +static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) +{ + long old_expected = old; + + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "cc", "memory"); + return old == old_expected; +} + +#endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h new file mode 100644 index 0000000000..82de2a7c41 --- /dev/null +++ b/arch/s390/include/asm/barrier.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +/* Fast-BCR without checkpoint synchronization */ +#define __ASM_BCR_SERIALIZE "bcr 14,0\n" +#else +#define __ASM_BCR_SERIALIZE "bcr 15,0\n" +#endif + +static __always_inline void bcr_serialize(void) +{ + asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); +} + +#define __mb() bcr_serialize() +#define __rmb() barrier() +#define __wmb() barrier() +#define __dma_rmb() __mb() +#define __dma_wmb() __mb() +#define __smp_mb() __mb() +#define __smp_rmb() __rmb() +#define __smp_wmb() __wmb() + +#define __smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define __smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + +#define __smp_mb__before_atomic() barrier() +#define __smp_mb__after_atomic() barrier() + +/** + * array_index_mask_nospec - generate a mask for array_idx() that is + * ~0UL when the bounds check succeeds and 0 otherwise + * @index: array element index + * @size: number of elements in array + */ +#define array_index_mask_nospec array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + if (__builtin_constant_p(size) && size > 0) { + asm(" clgr %2,%1\n" + " slbgr %0,%0\n" + :"=d" (mask) : "d" (size-1), "d" (index) :"cc"); + return mask; + } + asm(" clgr %1,%2\n" + " slbgr %0,%0\n" + :"=d" (mask) : "d" (size), "d" (index) :"cc"); + return ~mask; +} + +#include + +#endif /* __ASM_BARRIER_H */ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h new file mode 100644 index 0000000000..2de74fcd05 --- /dev/null +++ b/arch/s390/include/asm/bitops.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999,2013 + * + * Author(s): Martin Schwidefsky , + * + * The description below was taken in large parts from the powerpc + * bitops header file: + * Within a word, bits are numbered LSB first. Lot's of places make + * this assumption by directly testing bits with (val & (1< 1 word) bitmaps on a + * big-endian system because, unlike little endian, the number of each + * bit depends on the word size. + * + * The bitop functions are defined to work on unsigned longs, so the bits + * end up numbered: + * |63..............0|127............64|191...........128|255...........192| + * + * We also have special functions which work with an MSB0 encoding. + * The bits are numbered: + * |0..............63|64............127|128...........191|192...........255| + * + * The main difference is that bit 0-63 in the bit number field needs to be + * reversed compared to the LSB0 encoded bit fields. This can be achieved by + * XOR with 0x3f. + * + */ + +#ifndef _S390_BITOPS_H +#define _S390_BITOPS_H + +#ifndef _LINUX_BITOPS_H +#error only can be included directly +#endif + +#include +#include +#include +#include +#include + +#define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) + +static inline unsigned long * +__bitops_word(unsigned long nr, const volatile unsigned long *ptr) +{ + unsigned long addr; + + addr = (unsigned long)ptr + ((nr ^ (nr & (BITS_PER_LONG - 1))) >> 3); + return (unsigned long *)addr; +} + +static inline unsigned long __bitops_mask(unsigned long nr) +{ + return 1UL << (nr & (BITS_PER_LONG - 1)); +} + +static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + + __atomic64_or(mask, (long *)addr); +} + +static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + + __atomic64_and(~mask, (long *)addr); +} + +static __always_inline void arch_change_bit(unsigned long nr, + volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + + __atomic64_xor(mask, (long *)addr); +} + +static inline bool arch_test_and_set_bit(unsigned long nr, + volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = __atomic64_or_barrier(mask, (long *)addr); + return old & mask; +} + +static inline bool arch_test_and_clear_bit(unsigned long nr, + volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = __atomic64_and_barrier(~mask, (long *)addr); + return old & mask; +} + +static inline bool arch_test_and_change_bit(unsigned long nr, + volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = __atomic64_xor_barrier(mask, (long *)addr); + return old & mask; +} + +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + + *p |= mask; +} + +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + + *p &= ~mask; +} + +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + + *p ^= mask; +} + +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = *p; + *p |= mask; + return old & mask; +} + +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = *p; + *p &= ~mask; + return old & mask; +} + +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long *p = __bitops_word(nr, addr); + unsigned long mask = __bitops_mask(nr); + unsigned long old; + + old = *p; + *p ^= mask; + return old & mask; +} + +#define arch_test_bit generic_test_bit +#define arch_test_bit_acquire generic_test_bit_acquire + +static inline bool arch_test_and_set_bit_lock(unsigned long nr, + volatile unsigned long *ptr) +{ + if (arch_test_bit(nr, ptr)) + return true; + return arch_test_and_set_bit(nr, ptr); +} + +static inline void arch_clear_bit_unlock(unsigned long nr, + volatile unsigned long *ptr) +{ + smp_mb__before_atomic(); + arch_clear_bit(nr, ptr); +} + +static inline void arch___clear_bit_unlock(unsigned long nr, + volatile unsigned long *ptr) +{ + smp_mb(); + arch___clear_bit(nr, ptr); +} + +#include +#include +#include + +/* + * Functions which use MSB0 bit numbering. + * The bits are numbered: + * |0..............63|64............127|128...........191|192...........255| + */ +unsigned long find_first_bit_inv(const unsigned long *addr, unsigned long size); +unsigned long find_next_bit_inv(const unsigned long *addr, unsigned long size, + unsigned long offset); + +#define for_each_set_bit_inv(bit, addr, size) \ + for ((bit) = find_first_bit_inv((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit_inv((addr), (size), (bit) + 1)) + +static inline void set_bit_inv(unsigned long nr, volatile unsigned long *ptr) +{ + return set_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline void clear_bit_inv(unsigned long nr, volatile unsigned long *ptr) +{ + return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline bool test_and_clear_bit_inv(unsigned long nr, + volatile unsigned long *ptr) +{ + return test_and_clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline void __set_bit_inv(unsigned long nr, volatile unsigned long *ptr) +{ + return __set_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline void __clear_bit_inv(unsigned long nr, volatile unsigned long *ptr) +{ + return __clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +static inline bool test_bit_inv(unsigned long nr, + const volatile unsigned long *ptr) +{ + return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + +/** + * __flogr - find leftmost one + * @word - The word to search + * + * Returns the bit number of the most significant bit set, + * where the most significant bit has bit number 0. + * If no bit is set this function returns 64. + */ +static inline unsigned char __flogr(unsigned long word) +{ + if (__builtin_constant_p(word)) { + unsigned long bit = 0; + + if (!word) + return 64; + if (!(word & 0xffffffff00000000UL)) { + word <<= 32; + bit += 32; + } + if (!(word & 0xffff000000000000UL)) { + word <<= 16; + bit += 16; + } + if (!(word & 0xff00000000000000UL)) { + word <<= 8; + bit += 8; + } + if (!(word & 0xf000000000000000UL)) { + word <<= 4; + bit += 4; + } + if (!(word & 0xc000000000000000UL)) { + word <<= 2; + bit += 2; + } + if (!(word & 0x8000000000000000UL)) { + word <<= 1; + bit += 1; + } + return bit; + } else { + union register_pair rp; + + rp.even = word; + asm volatile( + " flogr %[rp],%[rp]\n" + : [rp] "+d" (rp.pair) : : "cc"); + return rp.even; + } +} + +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + return __flogr(-word & word) ^ (BITS_PER_LONG - 1); +} + +/** + * ffs - find first bit set + * @word: the word to search + * + * This is defined the same way as the libc and + * compiler builtin ffs routines (man ffs). + */ +static inline int ffs(int word) +{ + unsigned long mask = 2 * BITS_PER_LONG - 1; + unsigned int val = (unsigned int)word; + + return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask; +} + +/** + * __fls - find last (most-significant) set bit in a long word + * @word: the word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static inline unsigned long __fls(unsigned long word) +{ + return __flogr(word) ^ (BITS_PER_LONG - 1); +} + +/** + * fls64 - find last set bit in a 64-bit word + * @word: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffsll, but returns the position of the most significant set bit. + * + * fls64(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 64. + */ +static inline int fls64(unsigned long word) +{ + unsigned long mask = 2 * BITS_PER_LONG - 1; + + return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask; +} + +/** + * fls - find last (most-significant) bit set + * @word: the word to search + * + * This is defined the same way as ffs. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. + */ +static inline int fls(unsigned int word) +{ + return fls64(word); +} + +#include +#include +#include +#include +#include + +#endif /* _S390_BITOPS_H */ diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h new file mode 100644 index 0000000000..f7eed27b32 --- /dev/null +++ b/arch/s390/include/asm/boot_data.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_BOOT_DATA_H + +#include +#include + +extern char early_command_line[COMMAND_LINE_SIZE]; +extern struct ipl_parameter_block ipl_block; +extern int ipl_block_valid; +extern int ipl_secure_flag; + +extern unsigned long ipl_cert_list_addr; +extern unsigned long ipl_cert_list_size; + +extern unsigned long early_ipl_comp_list_addr; +extern unsigned long early_ipl_comp_list_size; + +#endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h new file mode 100644 index 0000000000..aebe1e22c7 --- /dev/null +++ b/arch/s390/include/asm/bug.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_BUG_H +#define _ASM_S390_BUG_H + +#include + +#ifdef CONFIG_BUG + +#ifdef CONFIG_DEBUG_BUGVERBOSE + +#define __EMIT_BUG(x) do { \ + asm_inline volatile( \ + "0: mc 0,0\n" \ + ".section .rodata.str,\"aMS\",@progbits,1\n" \ + "1: .asciz \""__FILE__"\"\n" \ + ".previous\n" \ + ".section __bug_table,\"awM\",@progbits,%2\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ + " .short %0,%1\n" \ + " .org 2b+%2\n" \ + ".previous\n" \ + : : "i" (__LINE__), \ + "i" (x), \ + "i" (sizeof(struct bug_entry))); \ +} while (0) + +#else /* CONFIG_DEBUG_BUGVERBOSE */ + +#define __EMIT_BUG(x) do { \ + asm_inline volatile( \ + "0: mc 0,0\n" \ + ".section __bug_table,\"awM\",@progbits,%1\n" \ + "1: .long 0b-.\n" \ + " .short %0\n" \ + " .org 1b+%1\n" \ + ".previous\n" \ + : : "i" (x), \ + "i" (sizeof(struct bug_entry))); \ +} while (0) + +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + +#define BUG() do { \ + __EMIT_BUG(0); \ + unreachable(); \ +} while (0) + +#define __WARN_FLAGS(flags) do { \ + __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ +} while (0) + +#define WARN_ON(x) ({ \ + int __ret_warn_on = !!(x); \ + if (__builtin_constant_p(__ret_warn_on)) { \ + if (__ret_warn_on) \ + __WARN(); \ + } else { \ + if (unlikely(__ret_warn_on)) \ + __WARN(); \ + } \ + unlikely(__ret_warn_on); \ +}) + +#define HAVE_ARCH_BUG +#define HAVE_ARCH_WARN_ON +#endif /* CONFIG_BUG */ + +#include + +#endif /* _ASM_S390_BUG_H */ diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h new file mode 100644 index 0000000000..00128174c0 --- /dev/null +++ b/arch/s390/include/asm/cache.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * + * Derived from "include/asm-i386/cache.h" + * Copyright (C) 1992, Linus Torvalds + */ + +#ifndef __ARCH_S390_CACHE_H +#define __ARCH_S390_CACHE_H + +#define L1_CACHE_BYTES 256 +#define L1_CACHE_SHIFT 8 +#define NET_SKB_PAD 32 + +#define __read_mostly __section(".data..read_mostly") + +#endif diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h new file mode 100644 index 0000000000..91d261751d --- /dev/null +++ b/arch/s390/include/asm/ccwdev.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2002, 2009 + * + * Author(s): Arnd Bergmann + * + * Interface for CCW device drivers + */ +#ifndef _S390_CCWDEV_H_ +#define _S390_CCWDEV_H_ + +#include +#include +#include +#include +#include +#include +#include + +/* structs from asm/cio.h */ +struct irb; +struct ccw1; +struct ccw_dev_id; + +/* simplified initializers for struct ccw_device: + * CCW_DEVICE and CCW_DEVICE_DEVTYPE initialize one + * entry in your MODULE_DEVICE_TABLE and set the match_flag correctly */ +#define CCW_DEVICE(cu, cum) \ + .cu_type=(cu), .cu_model=(cum), \ + .match_flags=(CCW_DEVICE_ID_MATCH_CU_TYPE \ + | (cum ? CCW_DEVICE_ID_MATCH_CU_MODEL : 0)) + +#define CCW_DEVICE_DEVTYPE(cu, cum, dev, devm) \ + .cu_type=(cu), .cu_model=(cum), .dev_type=(dev), .dev_model=(devm),\ + .match_flags=CCW_DEVICE_ID_MATCH_CU_TYPE \ + | ((cum) ? CCW_DEVICE_ID_MATCH_CU_MODEL : 0) \ + | CCW_DEVICE_ID_MATCH_DEVICE_TYPE \ + | ((devm) ? CCW_DEVICE_ID_MATCH_DEVICE_MODEL : 0) + +/* scan through an array of device ids and return the first + * entry that matches the device. + * + * the array must end with an entry containing zero match_flags + */ +static inline const struct ccw_device_id * +ccw_device_id_match(const struct ccw_device_id *array, + const struct ccw_device_id *match) +{ + const struct ccw_device_id *id = array; + + for (id = array; id->match_flags; id++) { + if ((id->match_flags & CCW_DEVICE_ID_MATCH_CU_TYPE) + && (id->cu_type != match->cu_type)) + continue; + + if ((id->match_flags & CCW_DEVICE_ID_MATCH_CU_MODEL) + && (id->cu_model != match->cu_model)) + continue; + + if ((id->match_flags & CCW_DEVICE_ID_MATCH_DEVICE_TYPE) + && (id->dev_type != match->dev_type)) + continue; + + if ((id->match_flags & CCW_DEVICE_ID_MATCH_DEVICE_MODEL) + && (id->dev_model != match->dev_model)) + continue; + + return id; + } + + return NULL; +} + +/** + * struct ccw_device - channel attached device + * @ccwlock: pointer to device lock + * @id: id of this device + * @drv: ccw driver for this device + * @dev: embedded device structure + * @online: online status of device + * @handler: interrupt handler + * + * @handler is a member of the device rather than the driver since a driver + * can have different interrupt handlers for different ccw devices + * (multi-subchannel drivers). + */ +struct ccw_device { + spinlock_t *ccwlock; +/* private: */ + struct ccw_device_private *private; /* cio private information */ + struct mutex reg_mutex; +/* public: */ + struct ccw_device_id id; + struct ccw_driver *drv; + struct device dev; + int online; + void (*handler) (struct ccw_device *, unsigned long, struct irb *); +}; + +/* + * Possible events used by the path_event notifier. + */ +#define PE_NONE 0x0 +#define PE_PATH_GONE 0x1 /* A path is no longer available. */ +#define PE_PATH_AVAILABLE 0x2 /* A path has become available and + was successfully verified. */ +#define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had + to be established again. */ +#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has + * changed. */ + +/* + * Possible CIO actions triggered by the unit check handler. + */ +enum uc_todo { + UC_TODO_RETRY, + UC_TODO_RETRY_ON_NEW_PATH, + UC_TODO_STOP +}; + +/** + * struct ccw_driver - device driver for channel attached devices + * @ids: ids supported by this driver + * @probe: function called on probe + * @remove: function called on remove + * @set_online: called when setting device online + * @set_offline: called when setting device offline + * @notify: notify driver of device state changes + * @path_event: notify driver of channel path events + * @shutdown: called at device shutdown + * @uc_handler: callback for unit check handler + * @driver: embedded device driver structure + * @int_class: interruption class to use for accounting interrupts + */ +struct ccw_driver { + struct ccw_device_id *ids; + int (*probe) (struct ccw_device *); + void (*remove) (struct ccw_device *); + int (*set_online) (struct ccw_device *); + int (*set_offline) (struct ccw_device *); + int (*notify) (struct ccw_device *, int); + void (*path_event) (struct ccw_device *, int *); + void (*shutdown) (struct ccw_device *); + enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *); + struct device_driver driver; + enum interruption_class int_class; +}; + +extern struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv, + const char *bus_id); + +/* devices drivers call these during module load and unload. + * When a driver is registered, its probe method is called + * when new devices for its type pop up */ +extern int ccw_driver_register (struct ccw_driver *driver); +extern void ccw_driver_unregister (struct ccw_driver *driver); +extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long); +extern int ccw_device_set_options(struct ccw_device *, unsigned long); +extern void ccw_device_clear_options(struct ccw_device *, unsigned long); +int ccw_device_is_pathgroup(struct ccw_device *cdev); +int ccw_device_is_multipath(struct ccw_device *cdev); + +/* Allow for i/o completion notification after primary interrupt status. */ +#define CCWDEV_EARLY_NOTIFICATION 0x0001 +/* Report all interrupt conditions. */ +#define CCWDEV_REPORT_ALL 0x0002 +/* Try to perform path grouping. */ +#define CCWDEV_DO_PATHGROUP 0x0004 +/* Allow forced onlining of boxed devices. */ +#define CCWDEV_ALLOW_FORCE 0x0008 +/* Try to use multipath mode. */ +#define CCWDEV_DO_MULTIPATH 0x0010 + +extern int ccw_device_start(struct ccw_device *, struct ccw1 *, + unsigned long, __u8, unsigned long); +extern int ccw_device_start_timeout(struct ccw_device *, struct ccw1 *, + unsigned long, __u8, unsigned long, int); +extern int ccw_device_start_key(struct ccw_device *, struct ccw1 *, + unsigned long, __u8, __u8, unsigned long); +extern int ccw_device_start_timeout_key(struct ccw_device *, struct ccw1 *, + unsigned long, __u8, __u8, + unsigned long, int); + + +extern int ccw_device_resume(struct ccw_device *); +extern int ccw_device_halt(struct ccw_device *, unsigned long); +extern int ccw_device_clear(struct ccw_device *, unsigned long); +int ccw_device_tm_start_key(struct ccw_device *cdev, struct tcw *tcw, + unsigned long intparm, u8 lpm, u8 key); +int ccw_device_tm_start_key(struct ccw_device *, struct tcw *, + unsigned long, u8, u8); +int ccw_device_tm_start_timeout_key(struct ccw_device *, struct tcw *, + unsigned long, u8, u8, int); +int ccw_device_tm_start(struct ccw_device *, struct tcw *, + unsigned long, u8); +int ccw_device_tm_start_timeout(struct ccw_device *, struct tcw *, + unsigned long, u8, int); +int ccw_device_tm_intrg(struct ccw_device *cdev); + +int ccw_device_get_mdc(struct ccw_device *cdev, u8 mask); + +extern int ccw_device_set_online(struct ccw_device *cdev); +extern int ccw_device_set_offline(struct ccw_device *cdev); + + +extern struct ciw *ccw_device_get_ciw(struct ccw_device *, __u32 cmd); +extern __u8 ccw_device_get_path_mask(struct ccw_device *); +extern void ccw_device_get_id(struct ccw_device *, struct ccw_dev_id *); + +#define get_ccwdev_lock(x) (x)->ccwlock + +#define to_ccwdev(n) container_of(n, struct ccw_device, dev) +#define to_ccwdrv(n) container_of(n, struct ccw_driver, driver) + +extern struct ccw_device *ccw_device_create_console(struct ccw_driver *); +extern void ccw_device_destroy_console(struct ccw_device *); +extern int ccw_device_enable_console(struct ccw_device *); +extern void ccw_device_wait_idle(struct ccw_device *); + +extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); +extern void ccw_device_dma_free(struct ccw_device *cdev, + void *cpu_addr, size_t size); + +int ccw_device_siosl(struct ccw_device *); + +extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); + +struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int); +u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx); +int ccw_device_pnso(struct ccw_device *cdev, + struct chsc_pnso_area *pnso_area, u8 oc, + struct chsc_pnso_resume_token resume_token, int cnc); +int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid); +int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid); +int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid); +int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid); +#endif /* _S390_CCWDEV_H_ */ diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h new file mode 100644 index 0000000000..11d2fb3de4 --- /dev/null +++ b/arch/s390/include/asm/ccwgroup.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef S390_CCWGROUP_H +#define S390_CCWGROUP_H + +struct ccw_device; +struct ccw_driver; + +/** + * struct ccwgroup_device - ccw group device + * @state: online/offline state + * @count: number of attached slave devices + * @dev: embedded device structure + * @cdev: variable number of slave devices, allocated as needed + * @ungroup_work: used to ungroup the ccwgroup device + */ +struct ccwgroup_device { + enum { + CCWGROUP_OFFLINE, + CCWGROUP_ONLINE, + } state; +/* private: */ + atomic_t onoff; + struct mutex reg_mutex; +/* public: */ + unsigned int count; + struct device dev; + struct work_struct ungroup_work; + struct ccw_device *cdev[]; +}; + +/** + * struct ccwgroup_driver - driver for ccw group devices + * @setup: function called during device creation to setup the device + * @remove: function called on remove + * @set_online: function called when device is set online + * @set_offline: function called when device is set offline + * @shutdown: function called when device is shut down + * @driver: embedded driver structure + * @ccw_driver: supported ccw_driver (optional) + */ +struct ccwgroup_driver { + int (*setup) (struct ccwgroup_device *); + void (*remove) (struct ccwgroup_device *); + int (*set_online) (struct ccwgroup_device *); + int (*set_offline) (struct ccwgroup_device *); + void (*shutdown)(struct ccwgroup_device *); + + struct device_driver driver; + struct ccw_driver *ccw_driver; +}; + +extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver); +extern void ccwgroup_driver_unregister (struct ccwgroup_driver *cdriver); +int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv, + int num_devices, const char *buf); + +extern int ccwgroup_set_online(struct ccwgroup_device *gdev); +int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv); + +extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); +extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); + +#define to_ccwgroupdev(x) container_of((x), struct ccwgroup_device, dev) +#define to_ccwgroupdrv(x) container_of((x), struct ccwgroup_driver, driver) + +#if IS_ENABLED(CONFIG_CCWGROUP) +bool dev_is_ccwgroup(struct device *dev); +#else /* CONFIG_CCWGROUP */ +static inline bool dev_is_ccwgroup(struct device *dev) +{ + return false; +} +#endif /* CONFIG_CCWGROUP */ + +#endif diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h new file mode 100644 index 0000000000..69837eec2f --- /dev/null +++ b/arch/s390/include/asm/checksum.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 fast network checksum routines + * + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Ulrich Hild (first version) + * Martin Schwidefsky (heavily optimized CKSM version) + * D.J. Barrow (third attempt) + */ + +#ifndef _S390_CHECKSUM_H +#define _S390_CHECKSUM_H + +#include +#include + +/* + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). + * + * Returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic. + * + * This function must be called with even lengths, except + * for the last fragment, which may be odd. + * + * It's best to have buff aligned on a 32-bit boundary. + */ +static inline __wsum csum_partial(const void *buff, int len, __wsum sum) +{ + union register_pair rp = { + .even = (unsigned long) buff, + .odd = (unsigned long) len, + }; + + kasan_check_read(buff, len); + asm volatile( + "0: cksm %[sum],%[rp]\n" + " jo 0b\n" + : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory"); + return sum; +} + +/* + * Fold a partial checksum without adding pseudo headers. + */ +static inline __sum16 csum_fold(__wsum sum) +{ + u32 csum = (__force u32) sum; + + csum += (csum >> 16) | (csum << 16); + csum >>= 16; + return (__force __sum16) ~csum; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksums on 4 octet boundaries. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + __u64 csum = 0; + __u32 *ptr = (u32 *)iph; + + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + csum += *ptr++; + ihl -= 4; + while (ihl--) + csum += *ptr++; + csum += (csum >> 32) | (csum << 32); + return csum_fold((__force __wsum)(csum >> 32)); +} + +/* + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 32-bit checksum. + */ +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) +{ + __u64 csum = (__force __u64)sum; + + csum += (__force __u32)saddr; + csum += (__force __u32)daddr; + csum += len; + csum += proto; + csum += (csum >> 32) | (csum << 32); + return (__force __wsum)(csum >> 32); +} + +/* + * Computes the checksum of the TCP/UDP pseudo-header. + * Returns a 16-bit checksum, already complemented. + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +/* + * Used for miscellaneous IP-like checksums, mainly icmp. + */ +static inline __sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold(csum_partial(buff, len, 0)); +} + +#define _HAVE_ARCH_IPV6_CSUM +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + __u64 sum = (__force __u64)csum; + + sum += (__force __u32)saddr->s6_addr32[0]; + sum += (__force __u32)saddr->s6_addr32[1]; + sum += (__force __u32)saddr->s6_addr32[2]; + sum += (__force __u32)saddr->s6_addr32[3]; + sum += (__force __u32)daddr->s6_addr32[0]; + sum += (__force __u32)daddr->s6_addr32[1]; + sum += (__force __u32)daddr->s6_addr32[2]; + sum += (__force __u32)daddr->s6_addr32[3]; + sum += len; + sum += proto; + sum += (sum >> 32) | (sum << 32); + return csum_fold((__force __wsum)(sum >> 32)); +} + +#endif /* _S390_CHECKSUM_H */ diff --git a/arch/s390/include/asm/chpid.h b/arch/s390/include/asm/chpid.h new file mode 100644 index 0000000000..20e0d22f29 --- /dev/null +++ b/arch/s390/include/asm/chpid.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2007, 2012 + * Author(s): Peter Oberparleiter + */ +#ifndef _ASM_S390_CHPID_H +#define _ASM_S390_CHPID_H + +#include +#include + +struct channel_path_desc_fmt0 { + u8 flags; + u8 lsn; + u8 desc; + u8 chpid; + u8 swla; + u8 zeroes; + u8 chla; + u8 chpp; +} __packed; + +static inline void chp_id_init(struct chp_id *chpid) +{ + memset(chpid, 0, sizeof(struct chp_id)); +} + +static inline int chp_id_is_equal(struct chp_id *a, struct chp_id *b) +{ + return (a->id == b->id) && (a->cssid == b->cssid); +} + +static inline void chp_id_next(struct chp_id *chpid) +{ + if (chpid->id < __MAX_CHPID) + chpid->id++; + else { + chpid->id = 0; + chpid->cssid++; + } +} + +static inline int chp_id_is_valid(struct chp_id *chpid) +{ + return (chpid->cssid <= __MAX_CSSID); +} + + +#define chp_id_for_each(c) \ + for (chp_id_init(c); chp_id_is_valid(c); chp_id_next(c)) +#endif /* _ASM_S390_CHPID_H */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h new file mode 100644 index 0000000000..bb48ea380c --- /dev/null +++ b/arch/s390/include/asm/chsc.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): Alexandra Winter + * + * Interface for Channel Subsystem Call + */ +#ifndef _ASM_S390_CHSC_H +#define _ASM_S390_CHSC_H + +#include + +/** + * Operation codes for CHSC PNSO: + * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport + * PNSO_OC_NET_ADDR_INFO - all addresses + */ +#define PNSO_OC_NET_BRIDGE_INFO 0 +#define PNSO_OC_NET_ADDR_INFO 3 +/** + * struct chsc_pnso_naid_l2 - network address information descriptor + * @nit: Network interface token + * @addr_lnid: network address and logical network id (VLAN ID) + */ +struct chsc_pnso_naid_l2 { + u64 nit; + struct { u8 mac[6]; u16 lnid; } addr_lnid; +} __packed; + +struct chsc_pnso_resume_token { + u64 t1; + u64 t2; +} __packed; + +struct chsc_pnso_naihdr { + struct chsc_pnso_resume_token resume_token; + u32:32; + u32 instance; + u32:24; + u8 naids; + u32 reserved[3]; +} __packed; + +struct chsc_pnso_area { + struct chsc_header request; + u8:2; + u8 m:1; + u8:5; + u8:2; + u8 ssid:2; + u8 fmt:4; + u16 sch; + u8:8; + u8 cssid; + u16:16; + u8 oc; + u32:24; + struct chsc_pnso_resume_token resume_token; + u32 n:1; + u32:31; + u32 reserved[3]; + struct chsc_header response; + u32:32; + struct chsc_pnso_naihdr naihdr; + struct chsc_pnso_naid_l2 entries[]; +} __packed __aligned(PAGE_SIZE); + +#endif /* _ASM_S390_CHSC_H */ diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h new file mode 100644 index 0000000000..1c4f585dd3 --- /dev/null +++ b/arch/s390/include/asm/cio.h @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common interface for I/O on S/390 + */ +#ifndef _ASM_S390_CIO_H_ +#define _ASM_S390_CIO_H_ + +#include +#include +#include +#include + +#define LPM_ANYPATH 0xff +#define __MAX_CSSID 0 +#define __MAX_SUBCHANNEL 65535 +#define __MAX_SSID 3 + +#include + +/** + * struct ccw1 - channel command word + * @cmd_code: command code + * @flags: flags, like IDA addressing, etc. + * @count: byte count + * @cda: data address + * + * The ccw is the basic structure to build channel programs that perform + * operations with the device or the control unit. Only Format-1 channel + * command words are supported. + */ +struct ccw1 { + __u8 cmd_code; + __u8 flags; + __u16 count; + __u32 cda; +} __attribute__ ((packed,aligned(8))); + +/** + * struct ccw0 - channel command word + * @cmd_code: command code + * @cda: data address + * @flags: flags, like IDA addressing, etc. + * @reserved: will be ignored + * @count: byte count + * + * The format-0 ccw structure. + */ +struct ccw0 { + __u8 cmd_code; + __u32 cda : 24; + __u8 flags; + __u8 reserved; + __u16 count; +} __packed __aligned(8); + +#define CCW_FLAG_DC 0x80 +#define CCW_FLAG_CC 0x40 +#define CCW_FLAG_SLI 0x20 +#define CCW_FLAG_SKIP 0x10 +#define CCW_FLAG_PCI 0x08 +#define CCW_FLAG_IDA 0x04 +#define CCW_FLAG_SUSPEND 0x02 + +#define CCW_CMD_READ_IPL 0x02 +#define CCW_CMD_NOOP 0x03 +#define CCW_CMD_BASIC_SENSE 0x04 +#define CCW_CMD_TIC 0x08 +#define CCW_CMD_STLCK 0x14 +#define CCW_CMD_SENSE_PGID 0x34 +#define CCW_CMD_SUSPEND_RECONN 0x5B +#define CCW_CMD_RDC 0x64 +#define CCW_CMD_RELEASE 0x94 +#define CCW_CMD_SET_PGID 0xAF +#define CCW_CMD_SENSE_ID 0xE4 +#define CCW_CMD_DCTL 0xF3 + +#define SENSE_MAX_COUNT 0x20 + +/** + * struct erw - extended report word + * @res0: reserved + * @auth: authorization check + * @pvrf: path-verification-required flag + * @cpt: channel-path timeout + * @fsavf: failing storage address validity flag + * @cons: concurrent sense + * @scavf: secondary ccw address validity flag + * @fsaf: failing storage address format + * @scnt: sense count, if @cons == %1 + * @res16: reserved + */ +struct erw { + __u32 res0 : 3; + __u32 auth : 1; + __u32 pvrf : 1; + __u32 cpt : 1; + __u32 fsavf : 1; + __u32 cons : 1; + __u32 scavf : 1; + __u32 fsaf : 1; + __u32 scnt : 6; + __u32 res16 : 16; +} __attribute__ ((packed)); + +/** + * struct erw_eadm - EADM Subchannel extended report word + * @b: aob error + * @r: arsb error + */ +struct erw_eadm { + __u32 : 16; + __u32 b : 1; + __u32 r : 1; + __u32 : 14; +} __packed; + +/** + * struct sublog - subchannel logout area + * @res0: reserved + * @esf: extended status flags + * @lpum: last path used mask + * @arep: ancillary report + * @fvf: field-validity flags + * @sacc: storage access code + * @termc: termination code + * @devsc: device-status check + * @serr: secondary error + * @ioerr: i/o-error alert + * @seqc: sequence code + */ +struct sublog { + __u32 res0 : 1; + __u32 esf : 7; + __u32 lpum : 8; + __u32 arep : 1; + __u32 fvf : 5; + __u32 sacc : 2; + __u32 termc : 2; + __u32 devsc : 1; + __u32 serr : 1; + __u32 ioerr : 1; + __u32 seqc : 3; +} __attribute__ ((packed)); + +/** + * struct esw0 - Format 0 Extended Status Word (ESW) + * @sublog: subchannel logout + * @erw: extended report word + * @faddr: failing storage address + * @saddr: secondary ccw address + */ +struct esw0 { + struct sublog sublog; + struct erw erw; + __u32 faddr[2]; + __u32 saddr; +} __attribute__ ((packed)); + +/** + * struct esw1 - Format 1 Extended Status Word (ESW) + * @zero0: reserved zeros + * @lpum: last path used mask + * @zero16: reserved zeros + * @erw: extended report word + * @zeros: three fullwords of zeros + */ +struct esw1 { + __u8 zero0; + __u8 lpum; + __u16 zero16; + struct erw erw; + __u32 zeros[3]; +} __attribute__ ((packed)); + +/** + * struct esw2 - Format 2 Extended Status Word (ESW) + * @zero0: reserved zeros + * @lpum: last path used mask + * @dcti: device-connect-time interval + * @erw: extended report word + * @zeros: three fullwords of zeros + */ +struct esw2 { + __u8 zero0; + __u8 lpum; + __u16 dcti; + struct erw erw; + __u32 zeros[3]; +} __attribute__ ((packed)); + +/** + * struct esw3 - Format 3 Extended Status Word (ESW) + * @zero0: reserved zeros + * @lpum: last path used mask + * @res: reserved + * @erw: extended report word + * @zeros: three fullwords of zeros + */ +struct esw3 { + __u8 zero0; + __u8 lpum; + __u16 res; + struct erw erw; + __u32 zeros[3]; +} __attribute__ ((packed)); + +/** + * struct esw_eadm - EADM Subchannel Extended Status Word (ESW) + * @sublog: subchannel logout + * @erw: extended report word + */ +struct esw_eadm { + __u32 sublog; + struct erw_eadm erw; + __u32 : 32; + __u32 : 32; + __u32 : 32; +} __packed; + +/** + * struct irb - interruption response block + * @scsw: subchannel status word + * @esw: extended status word + * @ecw: extended control word + * + * The irb that is handed to the device driver when an interrupt occurs. For + * solicited interrupts, the common I/O layer already performs checks whether + * a field is valid; a field not being valid is always passed as %0. + * If a unit check occurred, @ecw may contain sense data; this is retrieved + * by the common I/O layer itself if the device doesn't support concurrent + * sense (so that the device driver never needs to perform basic sense itself). + * For unsolicited interrupts, the irb is passed as-is (expect for sense data, + * if applicable). + */ +struct irb { + union scsw scsw; + union { + struct esw0 esw0; + struct esw1 esw1; + struct esw2 esw2; + struct esw3 esw3; + struct esw_eadm eadm; + } esw; + __u8 ecw[32]; +} __attribute__ ((packed,aligned(4))); + +/** + * struct ciw - command information word (CIW) layout + * @et: entry type + * @reserved: reserved bits + * @ct: command type + * @cmd: command code + * @count: command count + */ +struct ciw { + __u32 et : 2; + __u32 reserved : 2; + __u32 ct : 4; + __u32 cmd : 8; + __u32 count : 16; +} __attribute__ ((packed)); + +#define CIW_TYPE_RCD 0x0 /* read configuration data */ +#define CIW_TYPE_SII 0x1 /* set interface identifier */ +#define CIW_TYPE_RNI 0x2 /* read node identifier */ + +/* + * Node Descriptor as defined in SA22-7204, "Common I/O-Device Commands" + */ + +#define ND_VALIDITY_VALID 0 +#define ND_VALIDITY_OUTDATED 1 +#define ND_VALIDITY_INVALID 2 + +struct node_descriptor { + /* Flags. */ + union { + struct { + u32 validity:3; + u32 reserved:5; + } __packed; + u8 byte0; + } __packed; + + /* Node parameters. */ + u32 params:24; + + /* Node ID. */ + char type[6]; + char model[3]; + char manufacturer[3]; + char plant[2]; + char seq[12]; + u16 tag; +} __packed; + +/* + * Flags used as input parameters for do_IO() + */ +#define DOIO_ALLOW_SUSPEND 0x0001 /* allow for channel prog. suspend */ +#define DOIO_DENY_PREFETCH 0x0002 /* don't allow for CCW prefetch */ +#define DOIO_SUPPRESS_INTER 0x0004 /* suppress intermediate inter. */ + /* ... for suspended CCWs */ +/* Device or subchannel gone. */ +#define CIO_GONE 0x0001 +/* No path to device. */ +#define CIO_NO_PATH 0x0002 +/* Device has appeared. */ +#define CIO_OPER 0x0004 +/* Sick revalidation of device. */ +#define CIO_REVALIDATE 0x0008 +/* Device did not respond in time. */ +#define CIO_BOXED 0x0010 + +/** + * struct ccw_dev_id - unique identifier for ccw devices + * @ssid: subchannel set id + * @devno: device number + * + * This structure is not directly based on any hardware structure. The + * hardware identifies a device by its device number and its subchannel, + * which is in turn identified by its id. In order to get a unique identifier + * for ccw devices across subchannel sets, @struct ccw_dev_id has been + * introduced. + */ +struct ccw_dev_id { + u8 ssid; + u16 devno; +}; + +/** + * ccw_dev_id_is_equal() - compare two ccw_dev_ids + * @dev_id1: a ccw_dev_id + * @dev_id2: another ccw_dev_id + * Returns: + * %1 if the two structures are equal field-by-field, + * %0 if not. + * Context: + * any + */ +static inline int ccw_dev_id_is_equal(struct ccw_dev_id *dev_id1, + struct ccw_dev_id *dev_id2) +{ + if ((dev_id1->ssid == dev_id2->ssid) && + (dev_id1->devno == dev_id2->devno)) + return 1; + return 0; +} + +/** + * pathmask_to_pos() - find the position of the left-most bit in a pathmask + * @mask: pathmask with at least one bit set + */ +static inline u8 pathmask_to_pos(u8 mask) +{ + return 8 - ffs(mask); +} + +extern void css_schedule_reprobe(void); + +extern void *cio_dma_zalloc(size_t size); +extern void cio_dma_free(void *cpu_addr, size_t size); +extern struct device *cio_get_dma_css_dev(void); + +void *cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, + size_t size); +void cio_gp_dma_free(struct gen_pool *gp_dma, void *cpu_addr, size_t size); +void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); +struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); + +/* Function from drivers/s390/cio/chsc.c */ +int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta); +int chsc_sstpi(void *page, void *result, size_t size); +int chsc_stzi(void *page, void *result, size_t size); +int chsc_sgib(u32 origin); +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid); + +#endif diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h new file mode 100644 index 0000000000..03434369fc --- /dev/null +++ b/arch/s390/include/asm/clocksource.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* s390-specific clocksource additions */ + +#ifndef _ASM_S390_CLOCKSOURCE_H +#define _ASM_S390_CLOCKSOURCE_H + +#endif /* _ASM_S390_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h new file mode 100644 index 0000000000..10919eeb75 --- /dev/null +++ b/arch/s390/include/asm/clp.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_CLP_H +#define _ASM_S390_CLP_H + +/* CLP common request & response block size */ +#define CLP_BLK_SIZE PAGE_SIZE + +/* Call Logical Processor - Command Code */ +#define CLP_SLPC 0x0001 + +#define CLP_LPS_BASE 0 +#define CLP_LPS_PCI 2 + +struct clp_req_hdr { + u16 len; + u16 cmd; + u32 fmt : 4; + u32 reserved1 : 28; + u64 reserved2; +} __packed; + +struct clp_rsp_hdr { + u16 len; + u16 rsp; + u32 fmt : 4; + u32 reserved1 : 28; + u64 reserved2; +} __packed; + +/* CLP Response Codes */ +#define CLP_RC_OK 0x0010 /* Command request successfully */ +#define CLP_RC_CMD 0x0020 /* Command code not recognized */ +#define CLP_RC_PERM 0x0030 /* Command not authorized */ +#define CLP_RC_FMT 0x0040 /* Invalid command request format */ +#define CLP_RC_LEN 0x0050 /* Invalid command request length */ +#define CLP_RC_8K 0x0060 /* Command requires 8K LPCB */ +#define CLP_RC_RESNOT0 0x0070 /* Reserved field not zero */ +#define CLP_RC_NODATA 0x0080 /* No data available */ +#define CLP_RC_FC_UNKNOWN 0x0100 /* Function code not recognized */ + +/* Store logical-processor characteristics request */ +struct clp_req_slpc { + struct clp_req_hdr hdr; +} __packed; + +struct clp_rsp_slpc { + struct clp_rsp_hdr hdr; + u32 reserved2[4]; + u32 lpif[8]; + u32 reserved3[8]; + u32 lpic[8]; +} __packed; + +struct clp_req_rsp_slpc { + struct clp_req_slpc request; + struct clp_rsp_slpc response; +} __packed; + +#endif diff --git a/arch/s390/include/asm/cmb.h b/arch/s390/include/asm/cmb.h new file mode 100644 index 0000000000..599594c372 --- /dev/null +++ b/arch/s390/include/asm/cmb.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef S390_CMB_H +#define S390_CMB_H + +#include + +struct ccw_device; +extern int enable_cmf(struct ccw_device *cdev); +extern int disable_cmf(struct ccw_device *cdev); +extern int __disable_cmf(struct ccw_device *cdev); +extern u64 cmf_read(struct ccw_device *cdev, int index); +extern int cmf_readall(struct ccw_device *cdev, struct cmbdata *data); + +#endif /* S390_CMB_H */ diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h new file mode 100644 index 0000000000..aae0315374 --- /dev/null +++ b/arch/s390/include/asm/cmpxchg.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2011 + * + * Author(s): Martin Schwidefsky , + */ + +#ifndef __ASM_CMPXCHG_H +#define __ASM_CMPXCHG_H + +#include +#include +#include + +void __xchg_called_with_bad_pointer(void); + +static __always_inline unsigned long +__arch_xchg(unsigned long x, unsigned long address, int size) +{ + unsigned long old; + int shift; + + switch (size) { + case 1: + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xff) << shift), "d" (~(0xff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 2: + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + asm volatile( + " l %0,%1\n" + "0: lr 0,%0\n" + " nr 0,%3\n" + " or 0,%2\n" + " cs %0,0,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift)) + : "memory", "cc", "0"); + return old >> shift; + case 4: + asm volatile( + " l %0,%1\n" + "0: cs %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+Q" (*(int *) address) + : "d" (x) + : "memory", "cc"); + return old; + case 8: + asm volatile( + " lg %0,%1\n" + "0: csg %0,%2,%1\n" + " jl 0b\n" + : "=&d" (old), "+QS" (*(long *) address) + : "d" (x) + : "memory", "cc"); + return old; + } + __xchg_called_with_bad_pointer(); + return x; +} + +#define arch_xchg(ptr, x) \ +({ \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \ + sizeof(*(ptr))); \ + __ret; \ +}) + +void __cmpxchg_called_with_bad_pointer(void); + +static __always_inline unsigned long __cmpxchg(unsigned long address, + unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: { + unsigned int prev, shift, mask; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + old = (old & 0xff) << shift; + new = (new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 2: { + unsigned int prev, shift, mask; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + old = (old & 0xffff) << shift; + new = (new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" + "1:" + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); + return prev >> shift; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " cs %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " csg %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" (new) + : "memory", "cc"); + return prev; + } + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#define arch_cmpxchg(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + \ + __ret = (__typeof__(*(ptr))) \ + __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) + +#define arch_cmpxchg64 arch_cmpxchg +#define arch_cmpxchg_local arch_cmpxchg +#define arch_cmpxchg64_local arch_cmpxchg + +#define system_has_cmpxchg128() 1 + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + : [old] "+d" (old), [ptr] "+QS" (*ptr) + : [new] "d" (new) + : "memory", "cc"); + return old; +} + +#define arch_cmpxchg128 arch_cmpxchg128 + +#endif /* __ASM_CMPXCHG_H */ diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h new file mode 100644 index 0000000000..3cb9d813f0 --- /dev/null +++ b/arch/s390/include/asm/compat.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390X_COMPAT_H +#define _ASM_S390X_COMPAT_H +/* + * Architecture specific compatibility types + */ +#include +#include +#include +#include +#include + +#define compat_mode_t compat_mode_t +typedef u16 compat_mode_t; + +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + +#include + +#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ + typeof(0?(__force t)0:0ULL), u64)) + +#define __SC_DELOUSE(t,v) ({ \ + BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ + (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ +}) + +#define PSW32_MASK_USER 0x0000FF00UL + +#define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \ + PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \ + PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ + PSW32_ASC_PRIMARY) + +#define COMPAT_UTS_MACHINE "s390\0\0\0\0" + +typedef u16 compat_nlink_t; + +typedef struct { + u32 mask; + u32 addr; +} __aligned(8) psw_compat_t; + +typedef struct { + psw_compat_t psw; + u32 gprs[NUM_GPRS]; + u32 acrs[NUM_ACRS]; + u32 orig_gpr2; +} s390_compat_regs; + +typedef struct { + u32 gprs_high[NUM_GPRS]; +} s390_compat_regs_high; + +struct compat_stat { + compat_dev_t st_dev; + u16 __pad1; + compat_ino_t st_ino; + compat_mode_t st_mode; + compat_nlink_t st_nlink; + __compat_uid_t st_uid; + __compat_gid_t st_gid; + compat_dev_t st_rdev; + u16 __pad2; + u32 st_size; + u32 st_blksize; + u32 st_blocks; + u32 st_atime; + u32 st_atime_nsec; + u32 st_mtime; + u32 st_mtime_nsec; + u32 st_ctime; + u32 st_ctime_nsec; + u32 __unused4; + u32 __unused5; +}; + +struct compat_statfs { + u32 f_type; + u32 f_bsize; + u32 f_blocks; + u32 f_bfree; + u32 f_bavail; + u32 f_files; + u32 f_ffree; + compat_fsid_t f_fsid; + u32 f_namelen; + u32 f_frsize; + u32 f_flags; + u32 f_spare[4]; +}; + +struct compat_statfs64 { + u32 f_type; + u32 f_bsize; + u64 f_blocks; + u64 f_bfree; + u64 f_bavail; + u64 f_files; + u64 f_ffree; + compat_fsid_t f_fsid; + u32 f_namelen; + u32 f_frsize; + u32 f_flags; + u32 f_spare[5]; +}; + +/* + * A pointer passed in from user mode. This should not + * be used for syscall parameters, just declare them + * as pointers because the syscall entry code will have + * appropriately converted them already. + */ + +static inline void __user *compat_ptr(compat_uptr_t uptr) +{ + return (void __user *)(unsigned long)(uptr & 0x7fffffffUL); +} +#define compat_ptr(uptr) compat_ptr(uptr) + +#ifdef CONFIG_COMPAT + +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_31BIT); +} + +#endif + +#endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h new file mode 100644 index 0000000000..b378e2b57a --- /dev/null +++ b/arch/s390/include/asm/cpacf.h @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CP Assist for Cryptographic Functions (CPACF) + * + * Copyright IBM Corp. 2003, 2023 + * Author(s): Thomas Spatzier + * Jan Glauber + * Harald Freudenberger (freude@de.ibm.com) + * Martin Schwidefsky + */ +#ifndef _ASM_S390_CPACF_H +#define _ASM_S390_CPACF_H + +#include + +/* + * Instruction opcodes for the CPACF instructions + */ +#define CPACF_KMAC 0xb91e /* MSA */ +#define CPACF_KM 0xb92e /* MSA */ +#define CPACF_KMC 0xb92f /* MSA */ +#define CPACF_KIMD 0xb93e /* MSA */ +#define CPACF_KLMD 0xb93f /* MSA */ +#define CPACF_PCKMO 0xb928 /* MSA3 */ +#define CPACF_KMF 0xb92a /* MSA4 */ +#define CPACF_KMO 0xb92b /* MSA4 */ +#define CPACF_PCC 0xb92c /* MSA4 */ +#define CPACF_KMCTR 0xb92d /* MSA4 */ +#define CPACF_PRNO 0xb93c /* MSA5 */ +#define CPACF_KMA 0xb929 /* MSA8 */ +#define CPACF_KDSA 0xb93a /* MSA9 */ + +/* + * En/decryption modifier bits + */ +#define CPACF_ENCRYPT 0x00 +#define CPACF_DECRYPT 0x80 + +/* + * Function codes for the KM (CIPHER MESSAGE) instruction + */ +#define CPACF_KM_QUERY 0x00 +#define CPACF_KM_DEA 0x01 +#define CPACF_KM_TDEA_128 0x02 +#define CPACF_KM_TDEA_192 0x03 +#define CPACF_KM_AES_128 0x12 +#define CPACF_KM_AES_192 0x13 +#define CPACF_KM_AES_256 0x14 +#define CPACF_KM_PAES_128 0x1a +#define CPACF_KM_PAES_192 0x1b +#define CPACF_KM_PAES_256 0x1c +#define CPACF_KM_XTS_128 0x32 +#define CPACF_KM_XTS_256 0x34 +#define CPACF_KM_PXTS_128 0x3a +#define CPACF_KM_PXTS_256 0x3c + +/* + * Function codes for the KMC (CIPHER MESSAGE WITH CHAINING) + * instruction + */ +#define CPACF_KMC_QUERY 0x00 +#define CPACF_KMC_DEA 0x01 +#define CPACF_KMC_TDEA_128 0x02 +#define CPACF_KMC_TDEA_192 0x03 +#define CPACF_KMC_AES_128 0x12 +#define CPACF_KMC_AES_192 0x13 +#define CPACF_KMC_AES_256 0x14 +#define CPACF_KMC_PAES_128 0x1a +#define CPACF_KMC_PAES_192 0x1b +#define CPACF_KMC_PAES_256 0x1c +#define CPACF_KMC_PRNG 0x43 + +/* + * Function codes for the KMCTR (CIPHER MESSAGE WITH COUNTER) + * instruction + */ +#define CPACF_KMCTR_QUERY 0x00 +#define CPACF_KMCTR_DEA 0x01 +#define CPACF_KMCTR_TDEA_128 0x02 +#define CPACF_KMCTR_TDEA_192 0x03 +#define CPACF_KMCTR_AES_128 0x12 +#define CPACF_KMCTR_AES_192 0x13 +#define CPACF_KMCTR_AES_256 0x14 +#define CPACF_KMCTR_PAES_128 0x1a +#define CPACF_KMCTR_PAES_192 0x1b +#define CPACF_KMCTR_PAES_256 0x1c + +/* + * Function codes for the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) + * instruction + */ +#define CPACF_KIMD_QUERY 0x00 +#define CPACF_KIMD_SHA_1 0x01 +#define CPACF_KIMD_SHA_256 0x02 +#define CPACF_KIMD_SHA_512 0x03 +#define CPACF_KIMD_SHA3_224 0x20 +#define CPACF_KIMD_SHA3_256 0x21 +#define CPACF_KIMD_SHA3_384 0x22 +#define CPACF_KIMD_SHA3_512 0x23 +#define CPACF_KIMD_GHASH 0x41 + +/* + * Function codes for the KLMD (COMPUTE LAST MESSAGE DIGEST) + * instruction + */ +#define CPACF_KLMD_QUERY 0x00 +#define CPACF_KLMD_SHA_1 0x01 +#define CPACF_KLMD_SHA_256 0x02 +#define CPACF_KLMD_SHA_512 0x03 +#define CPACF_KLMD_SHA3_224 0x20 +#define CPACF_KLMD_SHA3_256 0x21 +#define CPACF_KLMD_SHA3_384 0x22 +#define CPACF_KLMD_SHA3_512 0x23 + +/* + * function codes for the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction + */ +#define CPACF_KMAC_QUERY 0x00 +#define CPACF_KMAC_DEA 0x01 +#define CPACF_KMAC_TDEA_128 0x02 +#define CPACF_KMAC_TDEA_192 0x03 + +/* + * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT) + * instruction + */ +#define CPACF_PCKMO_QUERY 0x00 +#define CPACF_PCKMO_ENC_DES_KEY 0x01 +#define CPACF_PCKMO_ENC_TDES_128_KEY 0x02 +#define CPACF_PCKMO_ENC_TDES_192_KEY 0x03 +#define CPACF_PCKMO_ENC_AES_128_KEY 0x12 +#define CPACF_PCKMO_ENC_AES_192_KEY 0x13 +#define CPACF_PCKMO_ENC_AES_256_KEY 0x14 +#define CPACF_PCKMO_ENC_ECC_P256_KEY 0x20 +#define CPACF_PCKMO_ENC_ECC_P384_KEY 0x21 +#define CPACF_PCKMO_ENC_ECC_P521_KEY 0x22 +#define CPACF_PCKMO_ENC_ECC_ED25519_KEY 0x28 +#define CPACF_PCKMO_ENC_ECC_ED448_KEY 0x29 + +/* + * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION) + * instruction + */ +#define CPACF_PRNO_QUERY 0x00 +#define CPACF_PRNO_SHA512_DRNG_GEN 0x03 +#define CPACF_PRNO_SHA512_DRNG_SEED 0x83 +#define CPACF_PRNO_TRNG_Q_R2C_RATIO 0x70 +#define CPACF_PRNO_TRNG 0x72 + +/* + * Function codes for the KMA (CIPHER MESSAGE WITH AUTHENTICATION) + * instruction + */ +#define CPACF_KMA_QUERY 0x00 +#define CPACF_KMA_GCM_AES_128 0x12 +#define CPACF_KMA_GCM_AES_192 0x13 +#define CPACF_KMA_GCM_AES_256 0x14 + +/* + * Flags for the KMA (CIPHER MESSAGE WITH AUTHENTICATION) instruction + */ +#define CPACF_KMA_LPC 0x100 /* Last-Plaintext/Ciphertext */ +#define CPACF_KMA_LAAD 0x200 /* Last-AAD */ +#define CPACF_KMA_HS 0x400 /* Hash-subkey Supplied */ + +typedef struct { unsigned char bytes[16]; } cpacf_mask_t; + +/** + * cpacf_query() - check if a specific CPACF function is available + * @opcode: the opcode of the crypto instruction + * @func: the function code to test for + * + * Executes the query function for the given crypto instruction @opcode + * and checks if @func is available + * + * Returns 1 if @func is available for @opcode, 0 otherwise + */ +static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +{ + asm volatile( + " lghi 0,0\n" /* query function */ + " lgr 1,%[mask]\n" + " spm 0\n" /* pckmo doesn't change the cc */ + /* Parameter regs are ignored, but must be nonzero and unique */ + "0: .insn rrf,%[opc] << 16,2,4,6,0\n" + " brc 1,0b\n" /* handle partial completion */ + : "=m" (*mask) + : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode) + : "cc", "0", "1"); +} + +static __always_inline int __cpacf_check_opcode(unsigned int opcode) +{ + switch (opcode) { + case CPACF_KMAC: + case CPACF_KM: + case CPACF_KMC: + case CPACF_KIMD: + case CPACF_KLMD: + return test_facility(17); /* check for MSA */ + case CPACF_PCKMO: + return test_facility(76); /* check for MSA3 */ + case CPACF_KMF: + case CPACF_KMO: + case CPACF_PCC: + case CPACF_KMCTR: + return test_facility(77); /* check for MSA4 */ + case CPACF_PRNO: + return test_facility(57); /* check for MSA5 */ + case CPACF_KMA: + return test_facility(146); /* check for MSA8 */ + default: + BUG(); + } +} + +static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +{ + if (__cpacf_check_opcode(opcode)) { + __cpacf_query(opcode, mask); + return 1; + } + memset(mask, 0, sizeof(*mask)); + return 0; +} + +static inline int cpacf_test_func(cpacf_mask_t *mask, unsigned int func) +{ + return (mask->bytes[func >> 3] & (0x80 >> (func & 7))) != 0; +} + +static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int func) +{ + cpacf_mask_t mask; + + if (cpacf_query(opcode, &mask)) + return cpacf_test_func(&mask, func); + return 0; +} + +/** + * cpacf_km() - executes the KM (CIPHER MESSAGE) instruction + * @func: the function code passed to KM; see CPACF_KM_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_km(unsigned long func, void *param, + u8 *dest, const u8 *src, long src_len) +{ + union register_pair d, s; + + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KM) + : "cc", "memory", "0", "1"); + + return src_len - s.odd; +} + +/** + * cpacf_kmc() - executes the KMC (CIPHER MESSAGE WITH CHAINING) instruction + * @func: the function code passed to KM; see CPACF_KMC_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_kmc(unsigned long func, void *param, + u8 *dest, const u8 *src, long src_len) +{ + union register_pair d, s; + + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMC) + : "cc", "memory", "0", "1"); + + return src_len - s.odd; +} + +/** + * cpacf_kimd() - executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) + * instruction + * @func: the function code passed to KM; see CPACF_KIMD_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + */ +static inline void cpacf_kimd(unsigned long func, void *param, + const u8 *src, long src_len) +{ + union register_pair s; + + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), + [opc] "i" (CPACF_KIMD) + : "cc", "memory", "0", "1"); +} + +/** + * cpacf_klmd() - executes the KLMD (COMPUTE LAST MESSAGE DIGEST) instruction + * @func: the function code passed to KM; see CPACF_KLMD_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + */ +static inline void cpacf_klmd(unsigned long func, void *param, + const u8 *src, long src_len) +{ + union register_pair s; + + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KLMD) + : "cc", "memory", "0", "1"); +} + +/** + * cpacf_kmac() - executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction + * @func: the function code passed to KM; see CPACF_KMAC_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for digest funcs + */ +static inline int cpacf_kmac(unsigned long func, void *param, + const u8 *src, long src_len) +{ + union register_pair s; + + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMAC) + : "cc", "memory", "0", "1"); + + return src_len - s.odd; +} + +/** + * cpacf_kmctr() - executes the KMCTR (CIPHER MESSAGE WITH COUNTER) instruction + * @func: the function code passed to KMCTR; see CPACF_KMCTR_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @counter: address of counter value + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, + const u8 *src, long src_len, u8 *counter) +{ + union register_pair d, s, c; + + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + c.even = (unsigned long)counter; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+&d" (s.pair), [dst] "+&d" (d.pair), + [ctr] "+&d" (c.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMCTR) + : "cc", "memory", "0", "1"); + + return src_len - s.odd; +} + +/** + * cpacf_prno() - executes the PRNO (PERFORM RANDOM NUMBER OPERATION) + * instruction + * @func: the function code passed to PRNO; see CPACF_PRNO_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @dest_len: size of destination memory area in bytes + * @seed: address of seed data + * @seed_len: size of seed data in bytes + */ +static inline void cpacf_prno(unsigned long func, void *param, + u8 *dest, unsigned long dest_len, + const u8 *seed, unsigned long seed_len) +{ + union register_pair d, s; + + d.even = (unsigned long)dest; + d.odd = (unsigned long)dest_len; + s.even = (unsigned long)seed; + s.odd = (unsigned long)seed_len; + asm volatile ( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,%[dst],%[seed]\n" + " brc 1,0b\n" /* handle partial completion */ + : [dst] "+&d" (d.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0", "1"); +} + +/** + * cpacf_trng() - executes the TRNG subfunction of the PRNO instruction + * @ucbuf: buffer for unconditioned data + * @ucbuf_len: amount of unconditioned data to fetch in bytes + * @cbuf: buffer for conditioned data + * @cbuf_len: amount of conditioned data to fetch in bytes + */ +static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, + u8 *cbuf, unsigned long cbuf_len) +{ + union register_pair u, c; + + u.even = (unsigned long)ucbuf; + u.odd = (unsigned long)ucbuf_len; + c.even = (unsigned long)cbuf; + c.odd = (unsigned long)cbuf_len; + asm volatile ( + " lghi 0,%[fc]\n" + "0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n" + " brc 1,0b\n" /* handle partial completion */ + : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair) + : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO) + : "cc", "memory", "0"); +} + +/** + * cpacf_pcc() - executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION) + * instruction + * @func: the function code passed to PCC; see CPACF_KM_xxx defines + * @param: address of parameter block; see POP for details on each func + */ +static inline void cpacf_pcc(unsigned long func, void *param) +{ + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ + " brc 1,0b\n" /* handle partial completion */ + : + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCC) + : "cc", "memory", "0", "1"); +} + +/** + * cpacf_pckmo() - executes the PCKMO (PERFORM CRYPTOGRAPHIC KEY + * MANAGEMENT) instruction + * @func: the function code passed to PCKMO; see CPACF_PCKMO_xxx defines + * @param: address of parameter block; see POP for details on each func + * + * Returns 0. + */ +static inline void cpacf_pckmo(long func, void *param) +{ + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + " .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */ + : + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_PCKMO) + : "cc", "memory", "0", "1"); +} + +/** + * cpacf_kma() - executes the KMA (CIPHER MESSAGE WITH AUTHENTICATION) + * instruction + * @func: the function code passed to KMA; see CPACF_KMA_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @aad: address of additional authenticated data memory area + * @aad_len: length of aad operand in bytes + */ +static inline void cpacf_kma(unsigned long func, void *param, u8 *dest, + const u8 *src, unsigned long src_len, + const u8 *aad, unsigned long aad_len) +{ + union register_pair d, s, a; + + d.even = (unsigned long)dest; + s.even = (unsigned long)src; + s.odd = (unsigned long)src_len; + a.even = (unsigned long)aad; + a.odd = (unsigned long)aad_len; + asm volatile( + " lgr 0,%[fc]\n" + " lgr 1,%[pba]\n" + "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n" + " brc 1,0b\n" /* handle partial completion */ + : [dst] "+&d" (d.pair), [src] "+&d" (s.pair), + [aad] "+&d" (a.pair) + : [fc] "d" (func), [pba] "d" ((unsigned long)param), + [opc] "i" (CPACF_KMA) + : "cc", "memory", "0", "1"); +} + +#endif /* _ASM_S390_CPACF_H */ diff --git a/arch/s390/include/asm/cpcmd.h b/arch/s390/include/asm/cpcmd.h new file mode 100644 index 0000000000..c3c993abe9 --- /dev/null +++ b/arch/s390/include/asm/cpcmd.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Christian Borntraeger (cborntra@de.ibm.com), + */ + +#ifndef _ASM_S390_CPCMD_H +#define _ASM_S390_CPCMD_H + +/* + * the lowlevel function for cpcmd + */ +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); + +/* + * cpcmd is the in-kernel interface for issuing CP commands + * + * cmd: null-terminated command string, max 240 characters + * response: response buffer for VM's textual response + * rlen: size of the response buffer, cpcmd will not exceed this size + * but will cap the output, if its too large. Everything that + * did not fit into the buffer will be silently dropped + * response_code: return pointer for VM's error code + * return value: the size of the response. The caller can check if the buffer + * was large enough by comparing the return value and rlen + * NOTE: If the response buffer is not in real storage, cpcmd can sleep + */ +int cpcmd(const char *cmd, char *response, int rlen, int *response_code); + +#endif /* _ASM_S390_CPCMD_H */ diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h new file mode 100644 index 0000000000..26c710cd34 --- /dev/null +++ b/arch/s390/include/asm/cpu.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2000, 2009 + * Author(s): Hartmut Penner , + * Martin Schwidefsky , + * Christian Ehrhardt , + */ + +#ifndef _ASM_S390_CPU_H +#define _ASM_S390_CPU_H + +#ifndef __ASSEMBLY__ + +#include +#include + +struct cpuid +{ + unsigned int version : 8; + unsigned int ident : 24; + unsigned int machine : 16; + unsigned int unused : 16; +} __attribute__ ((packed, aligned(8))); + +DECLARE_STATIC_KEY_FALSE(cpu_has_bear); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/cpu_mf-insn.h b/arch/s390/include/asm/cpu_mf-insn.h new file mode 100644 index 0000000000..a68b362e09 --- /dev/null +++ b/arch/s390/include/asm/cpu_mf-insn.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for CPU-MF instructions + * + * Copyright IBM Corp. 2019 + * Author(s): Hendrik Brueckner + */ +#ifndef _ASM_S390_CPU_MF_INSN_H +#define _ASM_S390_CPU_MF_INSN_H + +#ifdef __ASSEMBLY__ + +/* Macro to generate the STCCTM instruction with a customized + * M3 field designating the counter set. + */ +.macro STCCTM r1 m3 db2 + .insn rsy,0xeb0000000017,\r1,\m3 & 0xf,\db2 +.endm + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h new file mode 100644 index 0000000000..a0de5b9b02 --- /dev/null +++ b/arch/s390/include/asm/cpu_mf.h @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPU-measurement facilities + * + * Copyright IBM Corp. 2012, 2018 + * Author(s): Hendrik Brueckner + * Jan Glauber + */ +#ifndef _ASM_S390_CPU_MF_H +#define _ASM_S390_CPU_MF_H + +#include +#include +#include + +asm(".include \"asm/cpu_mf-insn.h\"\n"); + +#define CPU_MF_INT_SF_IAE (1 << 31) /* invalid entry address */ +#define CPU_MF_INT_SF_ISE (1 << 30) /* incorrect SDBT entry */ +#define CPU_MF_INT_SF_PRA (1 << 29) /* program request alert */ +#define CPU_MF_INT_SF_SACA (1 << 23) /* sampler auth. change alert */ +#define CPU_MF_INT_SF_LSDA (1 << 22) /* loss of sample data alert */ +#define CPU_MF_INT_CF_MTDA (1 << 15) /* loss of MT ctr. data alert */ +#define CPU_MF_INT_CF_CACA (1 << 7) /* counter auth. change alert */ +#define CPU_MF_INT_CF_LCDA (1 << 6) /* loss of counter data alert */ +#define CPU_MF_INT_CF_MASK (CPU_MF_INT_CF_MTDA|CPU_MF_INT_CF_CACA| \ + CPU_MF_INT_CF_LCDA) +#define CPU_MF_INT_SF_MASK (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE| \ + CPU_MF_INT_SF_PRA|CPU_MF_INT_SF_SACA| \ + CPU_MF_INT_SF_LSDA) + +#define CPU_MF_SF_RIBM_NOTAV 0x1 /* Sampling unavailable */ + +/* CPU measurement facility support */ +static inline int cpum_cf_avail(void) +{ + return test_facility(40) && test_facility(67); +} + +static inline int cpum_sf_avail(void) +{ + return test_facility(40) && test_facility(68); +} + +struct cpumf_ctr_info { + u16 cfvn; + u16 auth_ctl; + u16 enable_ctl; + u16 act_ctl; + u16 max_cpu; + u16 csvn; + u16 max_cg; + u16 reserved1; + u32 reserved2[12]; +} __packed; + +/* QUERY SAMPLING INFORMATION block */ +struct hws_qsi_info_block { /* Bit(s) */ + unsigned int b0_13:14; /* 0-13: zeros */ + unsigned int as:1; /* 14: basic-sampling authorization */ + unsigned int ad:1; /* 15: diag-sampling authorization */ + unsigned int b16_21:6; /* 16-21: zeros */ + unsigned int es:1; /* 22: basic-sampling enable control */ + unsigned int ed:1; /* 23: diag-sampling enable control */ + unsigned int b24_29:6; /* 24-29: zeros */ + unsigned int cs:1; /* 30: basic-sampling activation control */ + unsigned int cd:1; /* 31: diag-sampling activation control */ + unsigned int bsdes:16; /* 4-5: size of basic sampling entry */ + unsigned int dsdes:16; /* 6-7: size of diagnostic sampling entry */ + unsigned long min_sampl_rate; /* 8-15: minimum sampling interval */ + unsigned long max_sampl_rate; /* 16-23: maximum sampling interval*/ + unsigned long tear; /* 24-31: TEAR contents */ + unsigned long dear; /* 32-39: DEAR contents */ + unsigned int rsvrd0:24; /* 40-42: reserved */ + unsigned int ribm:8; /* 43: Reserved by IBM */ + unsigned int cpu_speed; /* 44-47: CPU speed */ + unsigned long long rsvrd1; /* 48-55: reserved */ + unsigned long long rsvrd2; /* 56-63: reserved */ +} __packed; + +/* SET SAMPLING CONTROLS request block */ +struct hws_lsctl_request_block { + unsigned int s:1; /* 0: maximum buffer indicator */ + unsigned int h:1; /* 1: part. level reserved for VM use*/ + unsigned long long b2_53:52;/* 2-53: zeros */ + unsigned int es:1; /* 54: basic-sampling enable control */ + unsigned int ed:1; /* 55: diag-sampling enable control */ + unsigned int b56_61:6; /* 56-61: - zeros */ + unsigned int cs:1; /* 62: basic-sampling activation control */ + unsigned int cd:1; /* 63: diag-sampling activation control */ + unsigned long interval; /* 8-15: sampling interval */ + unsigned long tear; /* 16-23: TEAR contents */ + unsigned long dear; /* 24-31: DEAR contents */ + /* 32-63: */ + unsigned long rsvrd1; /* reserved */ + unsigned long rsvrd2; /* reserved */ + unsigned long rsvrd3; /* reserved */ + unsigned long rsvrd4; /* reserved */ +} __packed; + +struct hws_basic_entry { + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int R:4; /* 16-19 reserved */ + unsigned int U:4; /* 20-23 Number of unique instruct. */ + unsigned int z:2; /* zeros */ + unsigned int T:1; /* 26 PSW DAT mode */ + unsigned int W:1; /* 27 PSW wait state */ + unsigned int P:1; /* 28 PSW Problem state */ + unsigned int AS:2; /* 29-30 PSW address-space control */ + unsigned int I:1; /* 31 entry valid or invalid */ + unsigned int CL:2; /* 32-33 Configuration Level */ + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; + unsigned int prim_asn:16; /* primary ASN */ + unsigned long long ia; /* Instruction Address */ + unsigned long long gpp; /* Guest Program Parameter */ + unsigned long long hpp; /* Host Program Parameter */ +} __packed; + +struct hws_diag_entry { + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int R:15; /* 16-19 and 20-30 reserved */ + unsigned int I:1; /* 31 entry valid or invalid */ + u8 data[]; /* Machine-dependent sample data */ +} __packed; + +struct hws_combined_entry { + struct hws_basic_entry basic; /* Basic-sampling data entry */ + struct hws_diag_entry diag; /* Diagnostic-sampling data entry */ +} __packed; + +union hws_trailer_header { + struct { + unsigned int f:1; /* 0 - Block Full Indicator */ + unsigned int a:1; /* 1 - Alert request control */ + unsigned int t:1; /* 2 - Timestamp format */ + unsigned int :29; /* 3 - 31: Reserved */ + unsigned int bsdes:16; /* 32-47: size of basic SDE */ + unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */ + unsigned long long overflow; /* 64 - Overflow Count */ + }; + u128 val; +}; + +struct hws_trailer_entry { + union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */ + unsigned char timestamp[16]; /* 16 - 31 timestamp */ + unsigned long long reserved1; /* 32 -Reserved */ + unsigned long long reserved2; /* */ + union { /* 48 - reserved for programming use */ + struct { + unsigned int clock_base:1; /* in progusage2 */ + unsigned long long progusage1:63; + unsigned long long progusage2; + }; + unsigned long long progusage[2]; + }; +} __packed; + +/* Load program parameter */ +static inline void lpp(void *pp) +{ + asm volatile("lpp 0(%0)\n" :: "a" (pp) : "memory"); +} + +/* Query counter information */ +static inline int qctri(struct cpumf_ctr_info *info) +{ + int rc = -EINVAL; + + asm volatile ( + "0: qctri %1\n" + "1: lhi %0,0\n" + "2:\n" + EX_TABLE(1b, 2b) + : "+d" (rc), "=Q" (*info)); + return rc; +} + +/* Load CPU-counter-set controls */ +static inline int lcctl(u64 ctl) +{ + int cc; + + asm volatile ( + " lcctl %1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) : "Q" (ctl) : "cc"); + return cc; +} + +/* Extract CPU counter */ +static inline int __ecctr(u64 ctr, u64 *content) +{ + u64 _content; + int cc; + + asm volatile ( + " ecctr %0,%2\n" + " ipm %1\n" + " srl %1,28\n" + : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc"); + *content = _content; + return cc; +} + +/* Extract CPU counter */ +static inline int ecctr(u64 ctr, u64 *val) +{ + u64 content; + int cc; + + cc = __ecctr(ctr, &content); + if (!cc) + *val = content; + return cc; +} + +/* Store CPU counter multiple for a particular counter set */ +enum stcctm_ctr_set { + EXTENDED = 0, + BASIC = 1, + PROBLEM_STATE = 2, + CRYPTO_ACTIVITY = 3, + MT_DIAG = 5, + MT_DIAG_CLEARING = 9, /* clears loss-of-MT-ctr-data alert */ +}; + +static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest) +{ + int cc; + + asm volatile ( + " STCCTM %2,%3,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : "Q" (*dest), "d" (range), "i" (set) + : "cc", "memory"); + return cc; +} + +/* Query sampling information */ +static inline int qsi(struct hws_qsi_info_block *info) +{ + int cc = 1; + + asm volatile( + "0: qsi %1\n" + "1: lhi %0,0\n" + "2:\n" + EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) + : "+d" (cc), "+Q" (*info)); + return cc ? -EINVAL : 0; +} + +/* Load sampling controls */ +static inline int lsctl(struct hws_lsctl_request_block *req) +{ + int cc; + + cc = 1; + asm volatile( + "0: lsctl 0(%1)\n" + "1: ipm %0\n" + " srl %0,28\n" + "2:\n" + EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) + : "+d" (cc), "+a" (req) + : "m" (*req) + : "cc", "memory"); + + return cc ? -EINVAL : 0; +} +#endif /* _ASM_S390_CPU_MF_H */ diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h new file mode 100644 index 0000000000..9312046137 --- /dev/null +++ b/arch/s390/include/asm/cpufeature.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Module interface for CPU features + * + * Copyright IBM Corp. 2015, 2022 + * Author(s): Hendrik Brueckner + */ + +#ifndef __ASM_S390_CPUFEATURE_H +#define __ASM_S390_CPUFEATURE_H + +enum { + S390_CPU_FEATURE_MSA, + S390_CPU_FEATURE_VXRS, + S390_CPU_FEATURE_UV, + MAX_CPU_FEATURES +}; + +#define cpu_feature(feature) (feature) + +int cpu_have_feature(unsigned int nr); + +#endif /* __ASM_S390_CPUFEATURE_H */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h new file mode 100644 index 0000000000..30bb3ec4e5 --- /dev/null +++ b/arch/s390/include/asm/cputime.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2004 + * + * Author: Martin Schwidefsky + */ + +#ifndef _S390_CPUTIME_H +#define _S390_CPUTIME_H + +#include +#include + +/* + * Convert cputime to nanoseconds. + */ +#define cputime_to_nsecs(cputime) tod_to_ns(cputime) + +void account_idle_time_irq(void); + +#endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/crw.h b/arch/s390/include/asm/crw.h new file mode 100644 index 0000000000..97456d98fe --- /dev/null +++ b/arch/s390/include/asm/crw.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data definitions for channel report processing + * Copyright IBM Corp. 2000, 2009 + * Author(s): Ingo Adlung , + * Martin Schwidefsky , + * Cornelia Huck , + */ + +#ifndef _ASM_S390_CRW_H +#define _ASM_S390_CRW_H + +#include + +/* + * Channel Report Word + */ +struct crw { + __u32 res1 : 1; /* reserved zero */ + __u32 slct : 1; /* solicited */ + __u32 oflw : 1; /* overflow */ + __u32 chn : 1; /* chained */ + __u32 rsc : 4; /* reporting source code */ + __u32 anc : 1; /* ancillary report */ + __u32 res2 : 1; /* reserved zero */ + __u32 erc : 6; /* error-recovery code */ + __u32 rsid : 16; /* reporting-source ID */ +} __attribute__ ((packed)); + +typedef void (*crw_handler_t)(struct crw *, struct crw *, int); + +extern int crw_register_handler(int rsc, crw_handler_t handler); +extern void crw_unregister_handler(int rsc); +extern void crw_handle_channel_report(void); +void crw_wait_for_channel_report(void); + +#define NR_RSCS 16 + +#define CRW_RSC_MONITOR 0x2 /* monitoring facility */ +#define CRW_RSC_SCH 0x3 /* subchannel */ +#define CRW_RSC_CPATH 0x4 /* channel path */ +#define CRW_RSC_CONFIG 0x9 /* configuration-alert facility */ +#define CRW_RSC_CSS 0xB /* channel subsystem */ + +#define CRW_ERC_EVENT 0x00 /* event information pending */ +#define CRW_ERC_AVAIL 0x01 /* available */ +#define CRW_ERC_INIT 0x02 /* initialized */ +#define CRW_ERC_TERROR 0x03 /* temporary error */ +#define CRW_ERC_IPARM 0x04 /* installed parm initialized */ +#define CRW_ERC_TERM 0x05 /* terminal */ +#define CRW_ERC_PERRN 0x06 /* perm. error, fac. not init */ +#define CRW_ERC_PERRI 0x07 /* perm. error, facility init */ +#define CRW_ERC_PMOD 0x08 /* installed parameters modified */ + +#endif /* _ASM_S390_CRW_H */ diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h new file mode 100644 index 0000000000..638137d46c --- /dev/null +++ b/arch/s390/include/asm/css_chars.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_CSS_CHARS_H +#define _ASM_CSS_CHARS_H + +#include + +struct css_general_char { + u64 : 12; + u64 dynio : 1; /* bit 12 */ + u64 : 4; + u64 eadm : 1; /* bit 17 */ + u64 : 23; + u64 aif : 1; /* bit 41 */ + u64 : 3; + u64 mcss : 1; /* bit 45 */ + u64 fcs : 1; /* bit 46 */ + u64 : 1; + u64 ext_mb : 1; /* bit 48 */ + u64 : 7; + u64 aif_tdd : 1; /* bit 56 */ + u64 : 1; + u64 qebsm : 1; /* bit 58 */ + u64 : 2; + u64 aiv : 1; /* bit 61 */ + u64 : 2; + + u64 : 3; + u64 aif_osa : 1; /* bit 67 */ + u64 : 12; + u64 eadm_rf : 1; /* bit 80 */ + u64 : 1; + u64 cib : 1; /* bit 82 */ + u64 : 5; + u64 fcx : 1; /* bit 88 */ + u64 : 19; + u64 alt_ssi : 1; /* bit 108 */ + u64 : 1; + u64 narf : 1; /* bit 110 */ + u64 : 5; + u64 enarf: 1; /* bit 116 */ + u64 : 6; + u64 util_str : 1;/* bit 123 */ +} __packed; + +extern struct css_general_char css_general_characteristics; + +#endif diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h new file mode 100644 index 0000000000..adf7d8cdac --- /dev/null +++ b/arch/s390/include/asm/ctl_reg.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_CTL_REG_H +#define __ASM_CTL_REG_H + +#include + +#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10) +#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35) +#define CR0_FETCH_PROTECTION_OVERRIDE BIT(63 - 38) +#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(63 - 39) +#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49) +#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50) +#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52) +#define CR0_CPU_TIMER_SUBMASK BIT(63 - 53) +#define CR0_SERVICE_SIGNAL_SUBMASK BIT(63 - 54) +#define CR0_UNUSED_56 BIT(63 - 56) +#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57) +#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58) + +#define CR14_UNUSED_32 BIT(63 - 32) +#define CR14_UNUSED_33 BIT(63 - 33) +#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35) +#define CR14_RECOVERY_SUBMASK BIT(63 - 36) +#define CR14_DEGRADATION_SUBMASK BIT(63 - 37) +#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(63 - 38) +#define CR14_WARNING_SUBMASK BIT(63 - 39) + +#ifndef __ASSEMBLY__ + +#include + +#define __ctl_load(array, low, high) do { \ + typedef struct { char _[sizeof(array)]; } addrtype; \ + \ + BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ + asm volatile( \ + " lctlg %1,%2,%0\n" \ + : \ + : "Q" (*(addrtype *)(&array)), "i" (low), "i" (high) \ + : "memory"); \ +} while (0) + +#define __ctl_store(array, low, high) do { \ + typedef struct { char _[sizeof(array)]; } addrtype; \ + \ + BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\ + asm volatile( \ + " stctg %1,%2,%0\n" \ + : "=Q" (*(addrtype *)(&array)) \ + : "i" (low), "i" (high)); \ +} while (0) + +static __always_inline void __ctl_set_bit(unsigned int cr, unsigned int bit) +{ + unsigned long reg; + + __ctl_store(reg, cr, cr); + reg |= 1UL << bit; + __ctl_load(reg, cr, cr); +} + +static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit) +{ + unsigned long reg; + + __ctl_store(reg, cr, cr); + reg &= ~(1UL << bit); + __ctl_load(reg, cr, cr); +} + +void smp_ctl_set_clear_bit(int cr, int bit, bool set); + +static inline void ctl_set_bit(int cr, int bit) +{ + smp_ctl_set_clear_bit(cr, bit, true); +} + +static inline void ctl_clear_bit(int cr, int bit) +{ + smp_ctl_set_clear_bit(cr, bit, false); +} + +union ctlreg0 { + unsigned long val; + struct { + unsigned long : 8; + unsigned long tcx : 1; /* Transactional-Execution control */ + unsigned long pifo : 1; /* Transactional-Execution Program- + Interruption-Filtering Override */ + unsigned long : 3; + unsigned long ccc : 1; /* Cryptography counter control */ + unsigned long pec : 1; /* PAI extension control */ + unsigned long : 17; + unsigned long : 3; + unsigned long lap : 1; /* Low-address-protection control */ + unsigned long : 4; + unsigned long edat : 1; /* Enhanced-DAT-enablement control */ + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; + unsigned long afp : 1; /* AFP-register control */ + unsigned long vx : 1; /* Vector enablement control */ + unsigned long : 7; + unsigned long sssm : 1; /* Service signal subclass mask */ + unsigned long : 9; + }; +}; + +union ctlreg2 { + unsigned long val; + struct { + unsigned long : 33; + unsigned long ducto : 25; + unsigned long : 1; + unsigned long gse : 1; + unsigned long : 1; + unsigned long tds : 1; + unsigned long tdc : 2; + }; +}; + +union ctlreg5 { + unsigned long val; + struct { + unsigned long : 33; + unsigned long pasteo: 25; + unsigned long : 6; + }; +}; + +union ctlreg15 { + unsigned long val; + struct { + unsigned long lsea : 61; + unsigned long : 3; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/current.h b/arch/s390/include/asm/current.h new file mode 100644 index 0000000000..68f8431527 --- /dev/null +++ b/arch/s390/include/asm/current.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/current.h" + */ + +#ifndef _S390_CURRENT_H +#define _S390_CURRENT_H + +#include + +struct task_struct; + +#define current ((struct task_struct *const)S390_lowcore.current_task) + +#endif /* !(_S390_CURRENT_H) */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h new file mode 100644 index 0000000000..ccd4e148b5 --- /dev/null +++ b/arch/s390/include/asm/debug.h @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S/390 debug facility + * + * Copyright IBM Corp. 1999, 2020 + */ +#ifndef _ASM_S390_DEBUG_H +#define _ASM_S390_DEBUG_H + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ +#define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ +#define DEBUG_FLUSH_ALL -1 /* parameter to flush all areas */ +#define DEBUG_MAX_VIEWS 10 /* max number of views in proc fs */ +#define DEBUG_MAX_NAME_LEN 64 /* max length for a debugfs file name */ +#define DEBUG_DEFAULT_LEVEL 3 /* initial debug level */ + +#define DEBUG_DIR_ROOT "s390dbf" /* name of debug root directory in proc fs */ + +#define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ + /* the entry information */ + +#define __DEBUG_FEATURE_VERSION 3 /* version of debug feature */ + +struct __debug_entry { + unsigned long clock : 60; + unsigned long exception : 1; + unsigned long level : 3; + void *caller; + unsigned short cpu; +} __packed; + +typedef struct __debug_entry debug_entry_t; + +struct debug_view; + +typedef struct debug_info { + struct debug_info *next; + struct debug_info *prev; + refcount_t ref_count; + spinlock_t lock; + int level; + int nr_areas; + int pages_per_area; + int buf_size; + int entry_size; + debug_entry_t ***areas; + int active_area; + int *active_pages; + int *active_entries; + struct dentry *debugfs_root_entry; + struct dentry *debugfs_entries[DEBUG_MAX_VIEWS]; + struct debug_view *views[DEBUG_MAX_VIEWS]; + char name[DEBUG_MAX_NAME_LEN]; + umode_t mode; +} debug_info_t; + +typedef int (debug_header_proc_t) (debug_info_t *id, + struct debug_view *view, + int area, + debug_entry_t *entry, + char *out_buf); + +typedef int (debug_format_proc_t) (debug_info_t *id, + struct debug_view *view, char *out_buf, + const char *in_buf); +typedef int (debug_prolog_proc_t) (debug_info_t *id, + struct debug_view *view, + char *out_buf); +typedef int (debug_input_proc_t) (debug_info_t *id, + struct debug_view *view, + struct file *file, + const char __user *user_buf, + size_t in_buf_size, loff_t *offset); + +int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf); + +struct debug_view { + char name[DEBUG_MAX_NAME_LEN]; + debug_prolog_proc_t *prolog_proc; + debug_header_proc_t *header_proc; + debug_format_proc_t *format_proc; + debug_input_proc_t *input_proc; + void *private_data; +}; + +extern struct debug_view debug_hex_ascii_view; +extern struct debug_view debug_sprintf_view; + +/* do NOT use the _common functions */ + +debug_entry_t *debug_event_common(debug_info_t *id, int level, + const void *data, int length); + +debug_entry_t *debug_exception_common(debug_info_t *id, int level, + const void *data, int length); + +/* Debug Feature API: */ + +debug_info_t *debug_register(const char *name, int pages, int nr_areas, + int buf_size); + +debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas, + int buf_size, umode_t mode, uid_t uid, + gid_t gid); + +void debug_unregister(debug_info_t *id); + +void debug_set_level(debug_info_t *id, int new_level); + +void debug_set_critical(void); + +void debug_stop_all(void); + +/** + * debug_level_enabled() - Returns true if debug events for the specified + * level would be logged. Otherwise returns false. + * + * @id: handle for debug log + * @level: debug level + * + * Return: + * - %true if level is less or equal to the current debug level. + */ +static inline bool debug_level_enabled(debug_info_t *id, int level) +{ + return level <= id->level; +} + +/** + * debug_event() - writes binary debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_event(debug_info_t *id, int level, + void *data, int length) +{ + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_event_common(id, level, data, length); +} + +/** + * debug_int_event() - writes unsigned integer debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, + unsigned int tag) +{ + unsigned int t = tag; + + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_event_common(id, level, &t, sizeof(unsigned int)); +} + +/** + * debug_long_event() - writes unsigned long debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: long integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, + unsigned long tag) +{ + unsigned long t = tag; + + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_event_common(id, level, &t, sizeof(unsigned long)); +} + +/** + * debug_text_event() - writes string debug entry in ascii format to active + * debug area (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, + const char *txt) +{ + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_event_common(id, level, txt, strlen(txt)); +} + +/* + * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! + */ +extern debug_entry_t * +__debug_sprintf_event(debug_info_t *id, int level, char *string, ...) + __attribute__ ((format(printf, 3, 4))); + +/** + * debug_sprintf_event() - writes debug entry with format string + * and varargs (longs) to active debug area + * (if level $<=$ actual debug level). + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ +#define debug_sprintf_event(_id, _level, _fmt, ...) \ +({ \ + debug_entry_t *__ret; \ + debug_info_t *__id = _id; \ + int __level = _level; \ + \ + if ((!__id) || (__level > __id->level)) \ + __ret = NULL; \ + else \ + __ret = __debug_sprintf_event(__id, __level, \ + _fmt, ## __VA_ARGS__); \ + __ret; \ +}) + +/** + * debug_exception() - writes binary debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_exception(debug_info_t *id, int level, + void *data, int length) +{ + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_exception_common(id, level, data, length); +} + +/** + * debug_int_exception() - writes unsigned int debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, + unsigned int tag) +{ + unsigned int t = tag; + + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_exception_common(id, level, &t, sizeof(unsigned int)); +} + +/** + * debug_long_exception() - writes long debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: long integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, + unsigned long tag) +{ + unsigned long t = tag; + + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_exception_common(id, level, &t, sizeof(unsigned long)); +} + +/** + * debug_text_exception() - writes string debug entry in ascii format to active + * debug area (if level <= actual debug level) + * and switches to next debug area + * area + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ +static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, + const char *txt) +{ + if ((!id) || (level > id->level) || (id->pages_per_area == 0)) + return NULL; + return debug_exception_common(id, level, txt, strlen(txt)); +} + +/* + * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! + */ +extern debug_entry_t * +__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) + __attribute__ ((format(printf, 3, 4))); + + +/** + * debug_sprintf_exception() - writes debug entry with format string and + * varargs (longs) to active debug area + * (if level <= actual debug level) + * and switches to next debug area. + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ +#define debug_sprintf_exception(_id, _level, _fmt, ...) \ +({ \ + debug_entry_t *__ret; \ + debug_info_t *__id = _id; \ + int __level = _level; \ + \ + if ((!__id) || (__level > __id->level)) \ + __ret = NULL; \ + else \ + __ret = __debug_sprintf_exception(__id, __level, \ + _fmt, ## __VA_ARGS__);\ + __ret; \ +}) + +int debug_register_view(debug_info_t *id, struct debug_view *view); + +int debug_unregister_view(debug_info_t *id, struct debug_view *view); + +#ifndef MODULE + +/* + * Note: Initial page and area numbers must be fixed to allow static + * initialization. This enables very early tracing. Changes to these values + * must be reflected in __DEFINE_STATIC_AREA. + */ +#define EARLY_PAGES 8 +#define EARLY_AREAS 1 + +#define VNAME(var, suffix) __##var##_##suffix + +/* + * Define static areas for early trace data. During boot debug_register_static() + * will replace these with dynamically allocated areas to allow custom page and + * area sizes, and dynamic resizing. + */ +#define __DEFINE_STATIC_AREA(var) \ +static char VNAME(var, data)[EARLY_PAGES][PAGE_SIZE] __initdata; \ +static debug_entry_t *VNAME(var, pages)[EARLY_PAGES] __initdata = { \ + (debug_entry_t *)VNAME(var, data)[0], \ + (debug_entry_t *)VNAME(var, data)[1], \ + (debug_entry_t *)VNAME(var, data)[2], \ + (debug_entry_t *)VNAME(var, data)[3], \ + (debug_entry_t *)VNAME(var, data)[4], \ + (debug_entry_t *)VNAME(var, data)[5], \ + (debug_entry_t *)VNAME(var, data)[6], \ + (debug_entry_t *)VNAME(var, data)[7], \ +}; \ +static debug_entry_t **VNAME(var, areas)[EARLY_AREAS] __initdata = { \ + (debug_entry_t **)VNAME(var, pages), \ +}; \ +static int VNAME(var, active_pages)[EARLY_AREAS] __initdata; \ +static int VNAME(var, active_entries)[EARLY_AREAS] __initdata + +#define __DEBUG_INFO_INIT(var, _name, _buf_size) { \ + .next = NULL, \ + .prev = NULL, \ + .ref_count = REFCOUNT_INIT(1), \ + .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ + .level = DEBUG_DEFAULT_LEVEL, \ + .nr_areas = EARLY_AREAS, \ + .pages_per_area = EARLY_PAGES, \ + .buf_size = (_buf_size), \ + .entry_size = sizeof(debug_entry_t) + (_buf_size), \ + .areas = VNAME(var, areas), \ + .active_area = 0, \ + .active_pages = VNAME(var, active_pages), \ + .active_entries = VNAME(var, active_entries), \ + .debugfs_root_entry = NULL, \ + .debugfs_entries = { NULL }, \ + .views = { NULL }, \ + .name = (_name), \ + .mode = 0600, \ +} + +#define __REGISTER_STATIC_DEBUG_INFO(var, name, pages, areas, view) \ +static int __init VNAME(var, reg)(void) \ +{ \ + debug_register_static(&var, (pages), (areas)); \ + debug_register_view(&var, (view)); \ + return 0; \ +} \ +arch_initcall(VNAME(var, reg)) + +/** + * DEFINE_STATIC_DEBUG_INFO - Define static debug_info_t + * + * @var: Name of debug_info_t variable + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages: Number of pages per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @view: Pointer to debug view struct + * + * Define a static debug_info_t for early tracing. The associated debugfs log + * is automatically registered with the specified debug view. + * + * Important: Users of this macro must not call any of the + * debug_register/_unregister() functions for this debug_info_t! + * + * Note: Tracing will start with a fixed number of initial pages and areas. + * The debug area will be changed to use the specified numbers during + * arch_initcall. + */ +#define DEFINE_STATIC_DEBUG_INFO(var, name, pages, nr_areas, buf_size, view) \ +__DEFINE_STATIC_AREA(var); \ +static debug_info_t __refdata var = \ + __DEBUG_INFO_INIT(var, (name), (buf_size)); \ +__REGISTER_STATIC_DEBUG_INFO(var, name, pages, nr_areas, view) + +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas); + +#endif /* MODULE */ + +#endif /* _ASM_S390_DEBUG_H */ diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h new file mode 100644 index 0000000000..21a8fe18fe --- /dev/null +++ b/arch/s390/include/asm/delay.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/delay.h" + * Copyright (C) 1993 Linus Torvalds + * + * Delay routines calling functions in arch/s390/lib/delay.c + */ + +#ifndef _S390_DELAY_H +#define _S390_DELAY_H + +void __ndelay(unsigned long nsecs); +void __udelay(unsigned long usecs); +void __delay(unsigned long loops); + +#define ndelay(n) __ndelay((unsigned long)(n)) +#define udelay(n) __udelay((unsigned long)(n)) +#define mdelay(n) __udelay((unsigned long)(n) * 1000) + +#endif /* defined(_S390_DELAY_H) */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h new file mode 100644 index 0000000000..bed8041375 --- /dev/null +++ b/arch/s390/include/asm/diag.h @@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 diagnose functions + * + * Copyright IBM Corp. 2007 + * Author(s): Michael Holzheu + */ + +#ifndef _ASM_S390_DIAG_H +#define _ASM_S390_DIAG_H + +#include +#include +#include +#include + +enum diag_stat_enum { + DIAG_STAT_X008, + DIAG_STAT_X00C, + DIAG_STAT_X010, + DIAG_STAT_X014, + DIAG_STAT_X044, + DIAG_STAT_X064, + DIAG_STAT_X08C, + DIAG_STAT_X09C, + DIAG_STAT_X0DC, + DIAG_STAT_X204, + DIAG_STAT_X210, + DIAG_STAT_X224, + DIAG_STAT_X250, + DIAG_STAT_X258, + DIAG_STAT_X26C, + DIAG_STAT_X288, + DIAG_STAT_X2C4, + DIAG_STAT_X2FC, + DIAG_STAT_X304, + DIAG_STAT_X308, + DIAG_STAT_X318, + DIAG_STAT_X320, + DIAG_STAT_X500, + NR_DIAG_STAT +}; + +void diag_stat_inc(enum diag_stat_enum nr); +void diag_stat_inc_norecursion(enum diag_stat_enum nr); + +/* + * Diagnose 10: Release page range + */ +static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) +{ + unsigned long start_addr, end_addr; + + start_addr = pfn_to_phys(start_pfn); + end_addr = pfn_to_phys(start_pfn + num_pfn - 1); + + diag_stat_inc(DIAG_STAT_X010); + asm volatile( + "0: diag %0,%1,0x10\n" + "1: nopr %%r7\n" + EX_TABLE(0b, 1b) + EX_TABLE(1b, 1b) + : : "a" (start_addr), "a" (end_addr)); +} + +/* + * Diagnose 14: Input spool file manipulation + */ +extern int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode); + +/* + * Diagnose 210: Get information about a virtual device + */ +struct diag210 { + u16 vrdcdvno; /* device number (input) */ + u16 vrdclen; /* data block length (input) */ + u8 vrdcvcla; /* virtual device class (output) */ + u8 vrdcvtyp; /* virtual device type (output) */ + u8 vrdcvsta; /* virtual device status (output) */ + u8 vrdcvfla; /* virtual device flags (output) */ + u8 vrdcrccl; /* real device class (output) */ + u8 vrdccrty; /* real device type (output) */ + u8 vrdccrmd; /* real device model (output) */ + u8 vrdccrft; /* real device feature (output) */ +} __packed __aligned(4); + +extern int diag210(struct diag210 *addr); + +struct diag8c { + u8 flags; + u8 num_partitions; + u16 width; + u16 height; + u8 data[]; +} __packed __aligned(4); + +extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno); + +/* bit is set in flags, when physical cpu info is included in diag 204 data */ +#define DIAG204_LPAR_PHYS_FLG 0x80 +#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ +#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ + +/* diag 204 subcodes */ +enum diag204_sc { + DIAG204_SUBC_STIB4 = 4, + DIAG204_SUBC_RSI = 5, + DIAG204_SUBC_STIB6 = 6, + DIAG204_SUBC_STIB7 = 7 +}; + +#define DIAG204_SUBCODE_MASK 0xffff + +/* The two available diag 204 data formats */ +enum diag204_format { + DIAG204_INFO_SIMPLE = 0, + DIAG204_INFO_EXT = 0x00010000 +}; + +enum diag204_cpu_flags { + DIAG204_CPU_ONLINE = 0x20, + DIAG204_CPU_CAPPED = 0x40, +}; + +struct diag204_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod; +} __packed; + +struct diag204_x_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod1; + __u64 curtod2; + char reserved[40]; +} __packed; + +struct diag204_part_hdr { + __u8 pn; + __u8 cpus; + char reserved[6]; + char part_name[DIAG204_LPAR_NAME_LEN]; +} __packed; + +struct diag204_x_part_hdr { + __u8 pn; + __u8 cpus; + __u8 rcpus; + __u8 pflag; + __u32 mlu; + char part_name[DIAG204_LPAR_NAME_LEN]; + char lpc_name[8]; + char os_name[8]; + __u64 online_cs; + __u64 online_es; + __u8 upid; + __u8 reserved:3; + __u8 mtid:5; + char reserved1[2]; + __u32 group_mlu; + char group_name[8]; + char hardware_group_name[8]; + char reserved2[24]; +} __packed; + +struct diag204_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; +} __packed; + +struct diag204_x_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; + __u16 min_weight; + __u16 cur_weight; + __u16 max_weight; + char reseved2[2]; + __u64 online_time; + __u64 wait_time; + __u32 pma_weight; + __u32 polar_weight; + __u32 cpu_type_cap; + __u32 group_cpu_type_cap; + char reserved3[32]; +} __packed; + +struct diag204_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; +} __packed; + +struct diag204_x_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; + char reserved3[80]; +} __packed; + +struct diag204_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[3]; + __u64 mgm_time; + char reserved3[8]; +} __packed; + +struct diag204_x_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[1]; + __u16 weight; + __u64 mgm_time; + char reserved3[80]; +} __packed; + +struct diag204_x_part_block { + struct diag204_x_part_hdr hdr; + struct diag204_x_cpu_info cpus[]; +} __packed; + +struct diag204_x_phys_block { + struct diag204_x_phys_hdr hdr; + struct diag204_x_phys_cpu cpus[]; +} __packed; + +enum diag26c_sc { + DIAG26C_PORT_VNIC = 0x00000024, + DIAG26C_MAC_SERVICES = 0x00000030 +}; + +enum diag26c_version { + DIAG26C_VERSION2 = 0x00000002, /* z/VM 5.4.0 */ + DIAG26C_VERSION6_VM65918 = 0x00020006 /* z/VM 6.4.0 + VM65918 */ +}; + +#define DIAG26C_VNIC_INFO 0x0002 +struct diag26c_vnic_req { + u32 resp_buf_len; + u32 resp_version; + u16 req_format; + u16 vlan_id; + u64 sys_name; + u8 res[2]; + u16 devno; +} __packed __aligned(8); + +#define VNIC_INFO_PROT_L3 1 +#define VNIC_INFO_PROT_L2 2 +/* Note: this is the bare minimum, use it for uninitialized VNICs only. */ +struct diag26c_vnic_resp { + u32 version; + u32 entry_cnt; + /* VNIC info: */ + u32 next_entry; + u64 owner; + u16 devno; + u8 status; + u8 type; + u64 lan_owner; + u64 lan_name; + u64 port_name; + u8 port_type; + u8 ext_status:6; + u8 protocol:2; + u16 base_devno; + u32 port_num; + u32 ifindex; + u32 maxinfo; + u32 dev_count; + /* 3x device info: */ + u8 dev_info1[28]; + u8 dev_info2[28]; + u8 dev_info3[28]; +} __packed __aligned(8); + +#define DIAG26C_GET_MAC 0x0000 +struct diag26c_mac_req { + u32 resp_buf_len; + u32 resp_version; + u16 op_code; + u16 devno; + u8 res[4]; +}; + +struct diag26c_mac_resp { + u32 version; + u8 mac[ETH_ALEN]; + u8 res[2]; +} __aligned(8); + +#define CPNC_LINUX 0x4 +union diag318_info { + unsigned long val; + struct { + unsigned long cpnc : 8; + unsigned long cpvc : 56; + }; +}; + +int diag204(unsigned long subcode, unsigned long size, void *addr); +int diag224(void *ptr); +int diag26c(void *req, void *resp, enum diag26c_sc subcode); + +struct hypfs_diag0c_entry; + +/* + * This structure must contain only pointers/references into + * the AMODE31 text section. + */ +struct diag_ops { + int (*diag210)(struct diag210 *addr); + int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); + int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + void (*diag0c)(struct hypfs_diag0c_entry *entry); + void (*diag308_reset)(void); +}; + +extern struct diag_ops diag_amode31_ops; +extern struct diag210 *__diag210_tmp_amode31; + +int _diag210_amode31(struct diag210 *addr); +int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_amode31(struct hypfs_diag0c_entry *entry); +void _diag308_reset_amode31(void); +int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); + +#endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h new file mode 100644 index 0000000000..c18ed60919 --- /dev/null +++ b/arch/s390/include/asm/dis.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Disassemble s390 instructions. + * + * Copyright IBM Corp. 2007 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + */ + +#ifndef __ASM_S390_DIS_H__ +#define __ASM_S390_DIS_H__ + +#include + +static inline int insn_length(unsigned char code) +{ + return ((((int) code + 64) >> 7) + 1) << 1; +} + +struct pt_regs; + +void show_code(struct pt_regs *regs); +void print_fn_code(unsigned char *code, unsigned long len); +struct s390_insn *find_insn(unsigned char *code); + +static inline int is_known_insn(unsigned char *code) +{ + return !!find_insn(code); +} + +#endif /* __ASM_S390_DIS_H__ */ diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h new file mode 100644 index 0000000000..7fe3e31956 --- /dev/null +++ b/arch/s390/include/asm/dma.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_DMA_H +#define _ASM_S390_DMA_H + +#include + +/* + * MAX_DMA_ADDRESS is ambiguous because on s390 its completely unrelated + * to DMA. It _is_ used for the s390 memory zone split at 2GB caused + * by the 31 bit heritage. + */ +#define MAX_DMA_ADDRESS __va(0x80000000) + +#endif /* _ASM_S390_DMA_H */ diff --git a/arch/s390/include/asm/dwarf.h b/arch/s390/include/asm/dwarf.h new file mode 100644 index 0000000000..4f21ae561e --- /dev/null +++ b/arch/s390/include/asm/dwarf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_DWARF_H +#define _ASM_S390_DWARF_H + +#ifdef __ASSEMBLY__ + +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_RESTORE .cfi_restore + +#ifdef CONFIG_AS_CFI_VAL_OFFSET +#define CFI_VAL_OFFSET .cfi_val_offset +#else +#define CFI_VAL_OFFSET # +#endif + +#ifndef BUILD_VDSO + /* + * Emit CFI data in .debug_frame sections and not in .eh_frame + * sections. The .eh_frame CFI is used for runtime unwind + * information that is not being used. Hence, vmlinux.lds.S + * can discard the .eh_frame sections. + */ + .cfi_sections .debug_frame +#else + /* + * For vDSO, emit CFI data in both, .eh_frame and .debug_frame + * sections. + */ + .cfi_sections .eh_frame, .debug_frame +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_DWARF_H */ diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h new file mode 100644 index 0000000000..06f795855a --- /dev/null +++ b/arch/s390/include/asm/eadm.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_EADM_H +#define _ASM_S390_EADM_H + +#include +#include +#include + +struct arqb { + u64 data; + u16 fmt:4; + u16:12; + u16 cmd_code; + u16:16; + u16 msb_count; + u32 reserved[12]; +} __packed; + +#define ARQB_CMD_MOVE 1 + +struct arsb { + u16 fmt:4; + u32:28; + u8 ef; + u8:8; + u8 ecbi; + u8:8; + u8 fvf; + u16:16; + u8 eqc; + u32:32; + u64 fail_msb; + u64 fail_aidaw; + u64 fail_ms; + u64 fail_scm; + u32 reserved[4]; +} __packed; + +#define EQC_WR_PROHIBIT 22 + +struct msb { + u8 fmt:4; + u8 oc:4; + u8 flags; + u16:12; + u16 bs:4; + u32 blk_count; + u64 data_addr; + u64 scm_addr; + u64:64; +} __packed; + +struct aidaw { + u8 flags; + u32 :24; + u32 :32; + u64 data_addr; +} __packed; + +#define MSB_OC_CLEAR 0 +#define MSB_OC_READ 1 +#define MSB_OC_WRITE 2 +#define MSB_OC_RELEASE 3 + +#define MSB_FLAG_BNM 0x80 +#define MSB_FLAG_IDA 0x40 + +#define MSB_BS_4K 0 +#define MSB_BS_1M 1 + +#define AOB_NR_MSB 124 + +struct aob { + struct arqb request; + struct arsb response; + struct msb msb[AOB_NR_MSB]; +} __packed __aligned(PAGE_SIZE); + +struct aob_rq_header { + struct scm_device *scmdev; + char data[]; +}; + +struct scm_device { + u64 address; + u64 size; + unsigned int nr_max_block; + struct device dev; + struct { + unsigned int persistence:4; + unsigned int oper_state:4; + unsigned int data_state:4; + unsigned int rank:4; + unsigned int release:1; + unsigned int res_id:8; + } __packed attrs; +}; + +#define OP_STATE_GOOD 1 +#define OP_STATE_TEMP_ERR 2 +#define OP_STATE_PERM_ERR 3 + +enum scm_event {SCM_CHANGE, SCM_AVAIL}; + +struct scm_driver { + struct device_driver drv; + int (*probe) (struct scm_device *scmdev); + void (*remove) (struct scm_device *scmdev); + void (*notify) (struct scm_device *scmdev, enum scm_event event); + void (*handler) (struct scm_device *scmdev, void *data, + blk_status_t error); +}; + +int scm_driver_register(struct scm_driver *scmdrv); +void scm_driver_unregister(struct scm_driver *scmdrv); + +int eadm_start_aob(struct aob *aob); +void scm_irq_handler(struct aob *aob, blk_status_t error); + +#endif /* _ASM_S390_EADM_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h new file mode 100644 index 0000000000..efb50fc686 --- /dev/null +++ b/arch/s390/include/asm/ebcdic.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * EBCDIC -> ASCII, ASCII -> EBCDIC conversion routines. + * + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky + */ + +#ifndef _EBCDIC_H +#define _EBCDIC_H + +#include + +extern __u8 _ascebc_500[256]; /* ASCII -> EBCDIC 500 conversion table */ +extern __u8 _ebcasc_500[256]; /* EBCDIC 500 -> ASCII conversion table */ +extern __u8 _ascebc[256]; /* ASCII -> EBCDIC conversion table */ +extern __u8 _ebcasc[256]; /* EBCDIC -> ASCII conversion table */ +extern __u8 _ebc_tolower[256]; /* EBCDIC -> lowercase */ +extern __u8 _ebc_toupper[256]; /* EBCDIC -> uppercase */ + +static inline void +codepage_convert(const __u8 *codepage, volatile char *addr, unsigned long nr) +{ + if (nr-- <= 0) + return; + asm volatile( + " bras 1,1f\n" + " tr 0(1,%0),0(%2)\n" + "0: tr 0(256,%0),0(%2)\n" + " la %0,256(%0)\n" + "1: ahi %1,-256\n" + " jnm 0b\n" + " ex %1,0(1)" + : "+&a" (addr), "+&a" (nr) + : "a" (codepage) : "cc", "memory", "1"); +} + +#define ASCEBC(addr,nr) codepage_convert(_ascebc, addr, nr) +#define EBCASC(addr,nr) codepage_convert(_ebcasc, addr, nr) +#define ASCEBC_500(addr,nr) codepage_convert(_ascebc_500, addr, nr) +#define EBCASC_500(addr,nr) codepage_convert(_ebcasc_500, addr, nr) +#define EBC_TOLOWER(addr,nr) codepage_convert(_ebc_tolower, addr, nr) +#define EBC_TOUPPER(addr,nr) codepage_convert(_ebc_toupper, addr, nr) + +#endif + diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h new file mode 100644 index 0000000000..70a30ae258 --- /dev/null +++ b/arch/s390/include/asm/elf.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * + * Derived from "include/asm-i386/elf.h" + */ + +#ifndef __ASMS390_ELF_H +#define __ASMS390_ELF_H + +/* s390 relocations defined by the ABIs */ +#define R_390_NONE 0 /* No reloc. */ +#define R_390_8 1 /* Direct 8 bit. */ +#define R_390_12 2 /* Direct 12 bit. */ +#define R_390_16 3 /* Direct 16 bit. */ +#define R_390_32 4 /* Direct 32 bit. */ +#define R_390_PC32 5 /* PC relative 32 bit. */ +#define R_390_GOT12 6 /* 12 bit GOT offset. */ +#define R_390_GOT32 7 /* 32 bit GOT offset. */ +#define R_390_PLT32 8 /* 32 bit PC relative PLT address. */ +#define R_390_COPY 9 /* Copy symbol at runtime. */ +#define R_390_GLOB_DAT 10 /* Create GOT entry. */ +#define R_390_JMP_SLOT 11 /* Create PLT entry. */ +#define R_390_RELATIVE 12 /* Adjust by program base. */ +#define R_390_GOTOFF32 13 /* 32 bit offset to GOT. */ +#define R_390_GOTPC 14 /* 32 bit PC rel. offset to GOT. */ +#define R_390_GOT16 15 /* 16 bit GOT offset. */ +#define R_390_PC16 16 /* PC relative 16 bit. */ +#define R_390_PC16DBL 17 /* PC relative 16 bit shifted by 1. */ +#define R_390_PLT16DBL 18 /* 16 bit PC rel. PLT shifted by 1. */ +#define R_390_PC32DBL 19 /* PC relative 32 bit shifted by 1. */ +#define R_390_PLT32DBL 20 /* 32 bit PC rel. PLT shifted by 1. */ +#define R_390_GOTPCDBL 21 /* 32 bit PC rel. GOT shifted by 1. */ +#define R_390_64 22 /* Direct 64 bit. */ +#define R_390_PC64 23 /* PC relative 64 bit. */ +#define R_390_GOT64 24 /* 64 bit GOT offset. */ +#define R_390_PLT64 25 /* 64 bit PC relative PLT address. */ +#define R_390_GOTENT 26 /* 32 bit PC rel. to GOT entry >> 1. */ +#define R_390_GOTOFF16 27 /* 16 bit offset to GOT. */ +#define R_390_GOTOFF64 28 /* 64 bit offset to GOT. */ +#define R_390_GOTPLT12 29 /* 12 bit offset to jump slot. */ +#define R_390_GOTPLT16 30 /* 16 bit offset to jump slot. */ +#define R_390_GOTPLT32 31 /* 32 bit offset to jump slot. */ +#define R_390_GOTPLT64 32 /* 64 bit offset to jump slot. */ +#define R_390_GOTPLTENT 33 /* 32 bit rel. offset to jump slot. */ +#define R_390_PLTOFF16 34 /* 16 bit offset from GOT to PLT. */ +#define R_390_PLTOFF32 35 /* 32 bit offset from GOT to PLT. */ +#define R_390_PLTOFF64 36 /* 16 bit offset from GOT to PLT. */ +#define R_390_TLS_LOAD 37 /* Tag for load insn in TLS code. */ +#define R_390_TLS_GDCALL 38 /* Tag for function call in general + dynamic TLS code. */ +#define R_390_TLS_LDCALL 39 /* Tag for function call in local + dynamic TLS code. */ +#define R_390_TLS_GD32 40 /* Direct 32 bit for general dynamic + thread local data. */ +#define R_390_TLS_GD64 41 /* Direct 64 bit for general dynamic + thread local data. */ +#define R_390_TLS_GOTIE12 42 /* 12 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_GOTIE32 43 /* 32 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_GOTIE64 44 /* 64 bit GOT offset for static TLS + block offset. */ +#define R_390_TLS_LDM32 45 /* Direct 32 bit for local dynamic + thread local data in LD code. */ +#define R_390_TLS_LDM64 46 /* Direct 64 bit for local dynamic + thread local data in LD code. */ +#define R_390_TLS_IE32 47 /* 32 bit address of GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_IE64 48 /* 64 bit address of GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_IEENT 49 /* 32 bit rel. offset to GOT entry for + negated static TLS block offset. */ +#define R_390_TLS_LE32 50 /* 32 bit negated offset relative to + static TLS block. */ +#define R_390_TLS_LE64 51 /* 64 bit negated offset relative to + static TLS block. */ +#define R_390_TLS_LDO32 52 /* 32 bit offset relative to TLS + block. */ +#define R_390_TLS_LDO64 53 /* 64 bit offset relative to TLS + block. */ +#define R_390_TLS_DTPMOD 54 /* ID of module containing symbol. */ +#define R_390_TLS_DTPOFF 55 /* Offset in TLS block. */ +#define R_390_TLS_TPOFF 56 /* Negate offset in static TLS + block. */ +#define R_390_20 57 /* Direct 20 bit. */ +#define R_390_GOT20 58 /* 20 bit GOT offset. */ +#define R_390_GOTPLT20 59 /* 20 bit offset to jump slot. */ +#define R_390_TLS_GOTIE20 60 /* 20 bit GOT offset for static TLS + block offset. */ +/* Keep this the last entry. */ +#define R_390_NUM 61 + +enum { + HWCAP_NR_ESAN3 = 0, + HWCAP_NR_ZARCH = 1, + HWCAP_NR_STFLE = 2, + HWCAP_NR_MSA = 3, + HWCAP_NR_LDISP = 4, + HWCAP_NR_EIMM = 5, + HWCAP_NR_DFP = 6, + HWCAP_NR_HPAGE = 7, + HWCAP_NR_ETF3EH = 8, + HWCAP_NR_HIGH_GPRS = 9, + HWCAP_NR_TE = 10, + HWCAP_NR_VXRS = 11, + HWCAP_NR_VXRS_BCD = 12, + HWCAP_NR_VXRS_EXT = 13, + HWCAP_NR_GS = 14, + HWCAP_NR_VXRS_EXT2 = 15, + HWCAP_NR_VXRS_PDE = 16, + HWCAP_NR_SORT = 17, + HWCAP_NR_DFLT = 18, + HWCAP_NR_VXRS_PDE2 = 19, + HWCAP_NR_NNPA = 20, + HWCAP_NR_PCI_MIO = 21, + HWCAP_NR_SIE = 22, + HWCAP_NR_MAX +}; + +/* Bits present in AT_HWCAP. */ +#define HWCAP_ESAN3 BIT(HWCAP_NR_ESAN3) +#define HWCAP_ZARCH BIT(HWCAP_NR_ZARCH) +#define HWCAP_STFLE BIT(HWCAP_NR_STFLE) +#define HWCAP_MSA BIT(HWCAP_NR_MSA) +#define HWCAP_LDISP BIT(HWCAP_NR_LDISP) +#define HWCAP_EIMM BIT(HWCAP_NR_EIMM) +#define HWCAP_DFP BIT(HWCAP_NR_DFP) +#define HWCAP_HPAGE BIT(HWCAP_NR_HPAGE) +#define HWCAP_ETF3EH BIT(HWCAP_NR_ETF3EH) +#define HWCAP_HIGH_GPRS BIT(HWCAP_NR_HIGH_GPRS) +#define HWCAP_TE BIT(HWCAP_NR_TE) +#define HWCAP_VXRS BIT(HWCAP_NR_VXRS) +#define HWCAP_VXRS_BCD BIT(HWCAP_NR_VXRS_BCD) +#define HWCAP_VXRS_EXT BIT(HWCAP_NR_VXRS_EXT) +#define HWCAP_GS BIT(HWCAP_NR_GS) +#define HWCAP_VXRS_EXT2 BIT(HWCAP_NR_VXRS_EXT2) +#define HWCAP_VXRS_PDE BIT(HWCAP_NR_VXRS_PDE) +#define HWCAP_SORT BIT(HWCAP_NR_SORT) +#define HWCAP_DFLT BIT(HWCAP_NR_DFLT) +#define HWCAP_VXRS_PDE2 BIT(HWCAP_NR_VXRS_PDE2) +#define HWCAP_NNPA BIT(HWCAP_NR_NNPA) +#define HWCAP_PCI_MIO BIT(HWCAP_NR_PCI_MIO) +#define HWCAP_SIE BIT(HWCAP_NR_SIE) + +/* + * These are used to set parameters in the core dumps. + */ +#define ELF_CLASS ELFCLASS64 +#define ELF_DATA ELFDATA2MSB +#define ELF_ARCH EM_S390 + +/* s390 specific phdr types */ +#define PT_S390_PGSTE 0x70000000 + +/* + * ELF register definitions.. + */ + +#include + +#include +#include +#include + +typedef s390_fp_regs elf_fpregset_t; +typedef s390_regs elf_gregset_t; + +typedef s390_fp_regs compat_elf_fpregset_t; +typedef s390_compat_regs compat_elf_gregset_t; + +#include /* for task_struct */ +#include + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) \ + (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ + && (x)->e_ident[EI_CLASS] == ELF_CLASS) +#define compat_elf_check_arch(x) \ + (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ + && (x)->e_ident[EI_CLASS] == ELF_CLASS) +#define compat_start_thread start_thread31 + +struct arch_elf_state { + int rc; +}; + +#define INIT_ARCH_ELF_STATE { .rc = 0 } + +#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0) +#ifdef CONFIG_PGSTE +#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ +({ \ + struct arch_elf_state *_state = state; \ + if ((phdr)->p_type == PT_S390_PGSTE && \ + !page_table_allocate_pgste && \ + !test_thread_flag(TIF_PGSTE) && \ + !current->mm->context.alloc_pgste) { \ + set_thread_flag(TIF_PGSTE); \ + set_pt_regs_flag(task_pt_regs(current), \ + PIF_EXECVE_PGSTE_RESTART); \ + _state->rc = -EAGAIN; \ + } \ + _state->rc; \ +}) +#else +#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ +({ \ + (state)->rc; \ +}) +#endif + +/* For SVR4/S390 the function pointer to be registered with `atexit` is + passed in R14. */ +#define ELF_PLAT_INIT(_r, load_addr) \ + do { \ + _r->gprs[14] = 0; \ + } while (0) + +#define CORE_DUMP_USE_REGSET +#define ELF_EXEC_PAGESIZE PAGE_SIZE + +/* This is the location that an ET_DYN program is loaded if exec'ed. Typical + use of this is to invoke "./ld.so someprog" to test out a new version of + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. 64-bit + tasks are aligned to 4GB. */ +#define ELF_ET_DYN_BASE (is_compat_task() ? \ + (STACK_TOP / 3 * 2) : \ + (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) + +/* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. */ + +extern unsigned long elf_hwcap; +#define ELF_HWCAP (elf_hwcap) + +/* This yields a string that ld.so will use to load implementation + specific libraries for optimization. This is more specific in + intent than poking at uname or /proc/cpuinfo. + + For the moment, we have only optimizations for the Intel generations, + but that could change... */ + +#define ELF_PLATFORM_SIZE 8 +extern char elf_platform[]; +#define ELF_PLATFORM (elf_platform) + +#ifndef CONFIG_COMPAT +#define SET_PERSONALITY(ex) \ +do { \ + set_personality(PER_LINUX | \ + (current->personality & (~PER_MASK))); \ + current->thread.sys_call_table = sys_call_table; \ +} while (0) +#else /* CONFIG_COMPAT */ +#define SET_PERSONALITY(ex) \ +do { \ + if (personality(current->personality) != PER_LINUX32) \ + set_personality(PER_LINUX | \ + (current->personality & ~PER_MASK)); \ + if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ + set_thread_flag(TIF_31BIT); \ + current->thread.sys_call_table = \ + sys_call_table_emu; \ + } else { \ + clear_thread_flag(TIF_31BIT); \ + current->thread.sys_call_table = \ + sys_call_table; \ + } \ +} while (0) +#endif /* CONFIG_COMPAT */ + +/* + * Cache aliasing on the latest machines calls for a mapping granularity + * of 512KB for the anonymous mapping base. For 64-bit processes use a + * 512KB alignment and a randomization of up to 1GB. For 31-bit processes + * the virtual address space is limited, use no alignment and limit the + * randomization to 8MB. + * For the additional randomization of the program break use 32MB for + * 64-bit and 8MB for 31-bit. + */ +#define BRK_RND_MASK (is_compat_task() ? 0x7ffUL : 0x1fffUL) +#define MMAP_RND_MASK (is_compat_task() ? 0x7ffUL : 0x3ff80UL) +#define MMAP_ALIGN_MASK (is_compat_task() ? 0 : 0x7fUL) +#define STACK_RND_MASK MMAP_RND_MASK + +/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso_base); \ +} while (0) + +struct linux_binprm; + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +int arch_setup_additional_pages(struct linux_binprm *, int); + +#endif diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h new file mode 100644 index 0000000000..fdd319a622 --- /dev/null +++ b/arch/s390/include/asm/entry-common.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_S390_ENTRY_COMMON_H +#define ARCH_S390_ENTRY_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) + +void do_per_trap(struct pt_regs *regs); + +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(0); + + pai_kernel_enter(regs); +} + +#define arch_enter_from_user_mode arch_enter_from_user_mode + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_PER_TRAP) { + clear_thread_flag(TIF_PER_TRAP); + do_per_trap(regs); + } + + if (ti_work & _TIF_GUARDED_STORAGE) + gs_load_bc_cb(regs); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static __always_inline void arch_exit_to_user_mode(void) +{ + if (test_cpu_flag(CIF_FPU)) + __load_fpu_regs(); + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + debug_user_asce(1); + + pai_kernel_exit(current_pt_regs()); +} + +#define arch_exit_to_user_mode arch_exit_to_user_mode + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + choose_random_kstack_offset(get_tod_clock_fast() & 0xff); +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + +#endif diff --git a/arch/s390/include/asm/exec.h b/arch/s390/include/asm/exec.h new file mode 100644 index 0000000000..641bfbec9d --- /dev/null +++ b/arch/s390/include/asm/exec.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_EXEC_H +#define __ASM_EXEC_H + +extern unsigned long arch_align_stack(unsigned long sp); + +#endif /* __ASM_EXEC_H */ diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h new file mode 100644 index 0000000000..af6ba52743 --- /dev/null +++ b/arch/s390/include/asm/extable.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_EXTABLE_H +#define __S390_EXTABLE_H + +#include +#include + +/* + * The exception table consists of three addresses: + * + * - Address of an instruction that is allowed to fault. + * - Address at which the program should continue. + * - Optional address of handler that takes pt_regs * argument and runs in + * interrupt context. + * + * No registers are modified, so it is entirely up to the continuation code + * to figure out what to do. + * + * All the routines below use bits of fixup code that are out of line + * with the main instruction path. This means when everything is well, + * we don't even have to jump over them. Further, they do not intrude + * on our cache or tlb entries. + */ + +struct exception_table_entry +{ + int insn, fixup; + short type, data; +}; + +extern struct exception_table_entry *__start_amode31_ex_table; +extern struct exception_table_entry *__stop_amode31_ex_table; + +const struct exception_table_entry *s390_search_extables(unsigned long addr); + +static inline unsigned long extable_fixup(const struct exception_table_entry *x) +{ + return (unsigned long)&x->fixup + x->fixup; +} + +#define ARCH_HAS_RELATIVE_EXTABLE + +static inline void swap_ex_entry_fixup(struct exception_table_entry *a, + struct exception_table_entry *b, + struct exception_table_entry tmp, + int delta) +{ + a->fixup = b->fixup + delta; + b->fixup = tmp.fixup - delta; + a->type = b->type; + b->type = tmp.type; + a->data = b->data; + b->data = tmp.data; +} +#define swap_ex_entry_fixup swap_ex_entry_fixup + +#ifdef CONFIG_BPF_JIT + +bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); + +#else /* !CONFIG_BPF_JIT */ + +static inline bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + return false; +} + +#endif /* CONFIG_BPF_JIT */ + +bool fixup_exception(struct pt_regs *regs); + +#endif diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h new file mode 100644 index 0000000000..568fd81bb7 --- /dev/null +++ b/arch/s390/include/asm/extmem.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * definitions for external memory segment support + * Copyright IBM Corp. 2003 + */ + +#ifndef _ASM_S390X_DCSS_H +#define _ASM_S390X_DCSS_H +#ifndef __ASSEMBLY__ + +/* possible values for segment type as returned by segment_info */ +#define SEG_TYPE_SW 0 +#define SEG_TYPE_EW 1 +#define SEG_TYPE_SR 2 +#define SEG_TYPE_ER 3 +#define SEG_TYPE_SN 4 +#define SEG_TYPE_EN 5 +#define SEG_TYPE_SC 6 +#define SEG_TYPE_EWEN 7 + +#define SEGMENT_SHARED 0 +#define SEGMENT_EXCLUSIVE 1 + +int segment_load (char *name, int segtype, unsigned long *addr, unsigned long *length); +void segment_unload(char *name); +void segment_save(char *name); +int segment_type (char* name); +int segment_modify_shared (char *name, int do_nonshared); +void segment_warning(int rc, char *seg_name); + +#endif +#endif diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h new file mode 100644 index 0000000000..94b6919026 --- /dev/null +++ b/arch/s390/include/asm/facility.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_FACILITY_H +#define __ASM_FACILITY_H + +#include + +#include +#include +#include +#include + +#include + +#define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8) + +extern u64 stfle_fac_list[16]; +extern u64 alt_stfle_fac_list[16]; + +static inline void __set_facility(unsigned long nr, void *facilities) +{ + unsigned char *ptr = (unsigned char *) facilities; + + if (nr >= MAX_FACILITY_BIT) + return; + ptr[nr >> 3] |= 0x80 >> (nr & 7); +} + +static inline void __clear_facility(unsigned long nr, void *facilities) +{ + unsigned char *ptr = (unsigned char *) facilities; + + if (nr >= MAX_FACILITY_BIT) + return; + ptr[nr >> 3] &= ~(0x80 >> (nr & 7)); +} + +static inline int __test_facility(unsigned long nr, void *facilities) +{ + unsigned char *ptr; + + if (nr >= MAX_FACILITY_BIT) + return 0; + ptr = (unsigned char *) facilities + (nr >> 3); + return (*ptr & (0x80 >> (nr & 7))) != 0; +} + +/* + * The test_facility function uses the bit ordering where the MSB is bit 0. + * That makes it easier to query facility bits with the bit number as + * documented in the Principles of Operation. + */ +static inline int test_facility(unsigned long nr) +{ + unsigned long facilities_als[] = { FACILITIES_ALS }; + + if (__builtin_constant_p(nr) && nr < sizeof(facilities_als) * 8) { + if (__test_facility(nr, &facilities_als)) + return 1; + } + return __test_facility(nr, &stfle_fac_list); +} + +static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) +{ + unsigned long reg0 = size - 1; + + asm volatile( + " lgr 0,%[reg0]\n" + " .insn s,0xb2b00000,%[list]\n" /* stfle */ + " lgr %[reg0],0\n" + : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list) + : + : "memory", "cc", "0"); + return reg0; +} + +/** + * stfle - Store facility list extended + * @stfle_fac_list: array where facility list can be stored + * @size: size of passed in array in double words + */ +static inline void __stfle(u64 *stfle_fac_list, int size) +{ + unsigned long nr; + u32 stfl_fac_list; + + asm volatile( + " stfl 0(0)\n" + : "=m" (S390_lowcore.stfl_fac_list)); + stfl_fac_list = S390_lowcore.stfl_fac_list; + memcpy(stfle_fac_list, &stfl_fac_list, 4); + nr = 4; /* bytes stored by stfl */ + if (stfl_fac_list & 0x01000000) { + /* More facility bits available with stfle */ + nr = __stfle_asm(stfle_fac_list, size); + nr = min_t(unsigned long, (nr + 1) * 8, size * 8); + } + memset((char *) stfle_fac_list + nr, 0, size * 8 - nr); +} + +static inline void stfle(u64 *stfle_fac_list, int size) +{ + preempt_disable(); + __stfle(stfle_fac_list, size); + preempt_enable(); +} + +#endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h new file mode 100644 index 0000000000..29784b4b44 --- /dev/null +++ b/arch/s390/include/asm/fcx.h @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions for assembling fcx enabled I/O control blocks. + * + * Copyright IBM Corp. 2008 + * Author(s): Peter Oberparleiter + */ + +#ifndef _ASM_S390_FCX_H +#define _ASM_S390_FCX_H + +#include + +#define TCW_FORMAT_DEFAULT 0 +#define TCW_TIDAW_FORMAT_DEFAULT 0 +#define TCW_FLAGS_INPUT_TIDA (1 << (23 - 5)) +#define TCW_FLAGS_TCCB_TIDA (1 << (23 - 6)) +#define TCW_FLAGS_OUTPUT_TIDA (1 << (23 - 7)) +#define TCW_FLAGS_TIDAW_FORMAT(x) ((x) & 3) << (23 - 9) +#define TCW_FLAGS_GET_TIDAW_FORMAT(x) (((x) >> (23 - 9)) & 3) + +/** + * struct tcw - Transport Control Word (TCW) + * @format: TCW format + * @flags: TCW flags + * @tccbl: Transport-Command-Control-Block Length + * @r: Read Operations + * @w: Write Operations + * @output: Output-Data Address + * @input: Input-Data Address + * @tsb: Transport-Status-Block Address + * @tccb: Transport-Command-Control-Block Address + * @output_count: Output Count + * @input_count: Input Count + * @intrg: Interrogate TCW Address + */ +struct tcw { + u32 format:2; + u32 :6; + u32 flags:24; + u32 :8; + u32 tccbl:6; + u32 r:1; + u32 w:1; + u32 :16; + u64 output; + u64 input; + u64 tsb; + u64 tccb; + u32 output_count; + u32 input_count; + u32 :32; + u32 :32; + u32 :32; + u32 intrg; +} __attribute__ ((packed, aligned(64))); + +#define TIDAW_FLAGS_LAST (1 << (7 - 0)) +#define TIDAW_FLAGS_SKIP (1 << (7 - 1)) +#define TIDAW_FLAGS_DATA_INT (1 << (7 - 2)) +#define TIDAW_FLAGS_TTIC (1 << (7 - 3)) +#define TIDAW_FLAGS_INSERT_CBC (1 << (7 - 4)) + +/** + * struct tidaw - Transport-Indirect-Addressing Word (TIDAW) + * @flags: TIDAW flags. Can be an arithmetic OR of the following constants: + * %TIDAW_FLAGS_LAST, %TIDAW_FLAGS_SKIP, %TIDAW_FLAGS_DATA_INT, + * %TIDAW_FLAGS_TTIC, %TIDAW_FLAGS_INSERT_CBC + * @count: Count + * @addr: Address + */ +struct tidaw { + u32 flags:8; + u32 :24; + u32 count; + u64 addr; +} __attribute__ ((packed, aligned(16))); + +/** + * struct tsa_iostat - I/O-Status Transport-Status Area (IO-Stat TSA) + * @dev_time: Device Time + * @def_time: Defer Time + * @queue_time: Queue Time + * @dev_busy_time: Device-Busy Time + * @dev_act_time: Device-Active-Only Time + * @sense: Sense Data (if present) + */ +struct tsa_iostat { + u32 dev_time; + u32 def_time; + u32 queue_time; + u32 dev_busy_time; + u32 dev_act_time; + u8 sense[32]; +} __attribute__ ((packed)); + +/** + * struct tsa_ddpcs - Device-Detected-Program-Check Transport-Status Area (DDPC TSA) + * @rc: Reason Code + * @rcq: Reason Code Qualifier + * @sense: Sense Data (if present) + */ +struct tsa_ddpc { + u32 :24; + u32 rc:8; + u8 rcq[16]; + u8 sense[32]; +} __attribute__ ((packed)); + +#define TSA_INTRG_FLAGS_CU_STATE_VALID (1 << (7 - 0)) +#define TSA_INTRG_FLAGS_DEV_STATE_VALID (1 << (7 - 1)) +#define TSA_INTRG_FLAGS_OP_STATE_VALID (1 << (7 - 2)) + +/** + * struct tsa_intrg - Interrogate Transport-Status Area (Intrg. TSA) + * @format: Format + * @flags: Flags. Can be an arithmetic OR of the following constants: + * %TSA_INTRG_FLAGS_CU_STATE_VALID, %TSA_INTRG_FLAGS_DEV_STATE_VALID, + * %TSA_INTRG_FLAGS_OP_STATE_VALID + * @cu_state: Controle-Unit State + * @dev_state: Device State + * @op_state: Operation State + * @sd_info: State-Dependent Information + * @dl_id: Device-Level Identifier + * @dd_data: Device-Dependent Data + */ +struct tsa_intrg { + u32 format:8; + u32 flags:8; + u32 cu_state:8; + u32 dev_state:8; + u32 op_state:8; + u32 :24; + u8 sd_info[12]; + u32 dl_id; + u8 dd_data[28]; +} __attribute__ ((packed)); + +#define TSB_FORMAT_NONE 0 +#define TSB_FORMAT_IOSTAT 1 +#define TSB_FORMAT_DDPC 2 +#define TSB_FORMAT_INTRG 3 + +#define TSB_FLAGS_DCW_OFFSET_VALID (1 << (7 - 0)) +#define TSB_FLAGS_COUNT_VALID (1 << (7 - 1)) +#define TSB_FLAGS_CACHE_MISS (1 << (7 - 2)) +#define TSB_FLAGS_TIME_VALID (1 << (7 - 3)) +#define TSB_FLAGS_FORMAT(x) ((x) & 7) +#define TSB_FORMAT(t) ((t)->flags & 7) + +/** + * struct tsb - Transport-Status Block (TSB) + * @length: Length + * @flags: Flags. Can be an arithmetic OR of the following constants: + * %TSB_FLAGS_DCW_OFFSET_VALID, %TSB_FLAGS_COUNT_VALID, %TSB_FLAGS_CACHE_MISS, + * %TSB_FLAGS_TIME_VALID + * @dcw_offset: DCW Offset + * @count: Count + * @tsa: Transport-Status-Area + */ +struct tsb { + u32 length:8; + u32 flags:8; + u32 dcw_offset:16; + u32 count; + u32 :32; + union { + struct tsa_iostat iostat; + struct tsa_ddpc ddpc; + struct tsa_intrg intrg; + } __attribute__ ((packed)) tsa; +} __attribute__ ((packed, aligned(8))); + +#define DCW_INTRG_FORMAT_DEFAULT 0 + +#define DCW_INTRG_RC_UNSPECIFIED 0 +#define DCW_INTRG_RC_TIMEOUT 1 + +#define DCW_INTRG_RCQ_UNSPECIFIED 0 +#define DCW_INTRG_RCQ_PRIMARY 1 +#define DCW_INTRG_RCQ_SECONDARY 2 + +#define DCW_INTRG_FLAGS_MPM (1 << (7 - 0)) +#define DCW_INTRG_FLAGS_PPR (1 << (7 - 1)) +#define DCW_INTRG_FLAGS_CRIT (1 << (7 - 2)) + +/** + * struct dcw_intrg_data - Interrogate DCW data + * @format: Format. Should be %DCW_INTRG_FORMAT_DEFAULT + * @rc: Reason Code. Can be one of %DCW_INTRG_RC_UNSPECIFIED, + * %DCW_INTRG_RC_TIMEOUT + * @rcq: Reason Code Qualifier: Can be one of %DCW_INTRG_RCQ_UNSPECIFIED, + * %DCW_INTRG_RCQ_PRIMARY, %DCW_INTRG_RCQ_SECONDARY + * @lpm: Logical-Path Mask + * @pam: Path-Available Mask + * @pim: Path-Installed Mask + * @timeout: Timeout + * @flags: Flags. Can be an arithmetic OR of %DCW_INTRG_FLAGS_MPM, + * %DCW_INTRG_FLAGS_PPR, %DCW_INTRG_FLAGS_CRIT + * @time: Time + * @prog_id: Program Identifier + * @prog_data: Program-Dependent Data + */ +struct dcw_intrg_data { + u32 format:8; + u32 rc:8; + u32 rcq:8; + u32 lpm:8; + u32 pam:8; + u32 pim:8; + u32 timeout:16; + u32 flags:8; + u32 :24; + u32 :32; + u64 time; + u64 prog_id; + u8 prog_data[]; +} __attribute__ ((packed)); + +#define DCW_FLAGS_CC (1 << (7 - 1)) + +#define DCW_CMD_WRITE 0x01 +#define DCW_CMD_READ 0x02 +#define DCW_CMD_CONTROL 0x03 +#define DCW_CMD_SENSE 0x04 +#define DCW_CMD_SENSE_ID 0xe4 +#define DCW_CMD_INTRG 0x40 + +/** + * struct dcw - Device-Command Word (DCW) + * @cmd: Command Code. Can be one of %DCW_CMD_WRITE, %DCW_CMD_READ, + * %DCW_CMD_CONTROL, %DCW_CMD_SENSE, %DCW_CMD_SENSE_ID, %DCW_CMD_INTRG + * @flags: Flags. Can be an arithmetic OR of %DCW_FLAGS_CC + * @cd_count: Control-Data Count + * @count: Count + * @cd: Control Data + */ +struct dcw { + u32 cmd:8; + u32 flags:8; + u32 :8; + u32 cd_count:8; + u32 count; + u8 cd[]; +} __attribute__ ((packed)); + +#define TCCB_FORMAT_DEFAULT 0x7f +#define TCCB_MAX_DCW 30 +#define TCCB_MAX_SIZE (sizeof(struct tccb_tcah) + \ + TCCB_MAX_DCW * sizeof(struct dcw) + \ + sizeof(struct tccb_tcat)) +#define TCCB_SAC_DEFAULT 0x1ffe +#define TCCB_SAC_INTRG 0x1fff + +/** + * struct tccb_tcah - Transport-Command-Area Header (TCAH) + * @format: Format. Should be %TCCB_FORMAT_DEFAULT + * @tcal: Transport-Command-Area Length + * @sac: Service-Action Code. Can be one of %TCCB_SAC_DEFAULT, %TCCB_SAC_INTRG + * @prio: Priority + */ +struct tccb_tcah { + u32 format:8; + u32 :24; + u32 :24; + u32 tcal:8; + u32 sac:16; + u32 :8; + u32 prio:8; + u32 :32; +} __attribute__ ((packed)); + +/** + * struct tccb_tcat - Transport-Command-Area Trailer (TCAT) + * @count: Transport Count + */ +struct tccb_tcat { + u32 :32; + u32 count; +} __attribute__ ((packed)); + +/** + * struct tccb - (partial) Transport-Command-Control Block (TCCB) + * @tcah: TCAH + * @tca: Transport-Command Area + */ +struct tccb { + struct tccb_tcah tcah; + u8 tca[]; +} __attribute__ ((packed, aligned(8))); + +struct tcw *tcw_get_intrg(struct tcw *tcw); +void *tcw_get_data(struct tcw *tcw); +struct tccb *tcw_get_tccb(struct tcw *tcw); +struct tsb *tcw_get_tsb(struct tcw *tcw); + +void tcw_init(struct tcw *tcw, int r, int w); +void tcw_finalize(struct tcw *tcw, int num_tidaws); + +void tcw_set_intrg(struct tcw *tcw, struct tcw *intrg_tcw); +void tcw_set_data(struct tcw *tcw, void *data, int use_tidal); +void tcw_set_tccb(struct tcw *tcw, struct tccb *tccb); +void tcw_set_tsb(struct tcw *tcw, struct tsb *tsb); + +void tccb_init(struct tccb *tccb, size_t tccb_size, u32 sac); +void tsb_init(struct tsb *tsb); +struct dcw *tccb_add_dcw(struct tccb *tccb, size_t tccb_size, u8 cmd, u8 flags, + void *cd, u8 cd_count, u32 count); +struct tidaw *tcw_add_tidaw(struct tcw *tcw, int num_tidaws, u8 flags, + void *addr, u32 count); + +#endif /* _ASM_S390_FCX_H */ diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h new file mode 100644 index 0000000000..9acf48e53a --- /dev/null +++ b/arch/s390/include/asm/fpu/api.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * In-kernel FPU support functions + * + * + * Consider these guidelines before using in-kernel FPU functions: + * + * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel + * use of floating-point or vector registers and instructions. + * + * 2. For kernel_fpu_begin(), specify the vector register range you want to + * use with the KERNEL_VXR_* constants. Consider these usage guidelines: + * + * a) If your function typically runs in process-context, use the lower + * half of the vector registers, for example, specify KERNEL_VXR_LOW. + * b) If your function typically runs in soft-irq or hard-irq context, + * prefer using the upper half of the vector registers, for example, + * specify KERNEL_VXR_HIGH. + * + * If you adhere to these guidelines, an interrupted process context + * does not require to save and restore vector registers because of + * disjoint register ranges. + * + * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions + * includes logic to save and restore up to 16 vector registers at once. + * + * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different + * struct kernel_fpu states. Vector registers that are in use by outer + * levels are saved and restored. You can minimize the save and restore + * effort by choosing disjoint vector register ranges. + * + * 5. To use vector floating-point instructions, specify the KERNEL_FPC + * flag to save and restore floating-point controls in addition to any + * vector register range. + * + * 6. To use floating-point registers and instructions only, specify the + * KERNEL_FPR flag. This flag triggers a save and restore of vector + * registers V0 to V15 and floating-point controls. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#ifndef _ASM_S390_FPU_API_H +#define _ASM_S390_FPU_API_H + +#include +#include + +void save_fpu_regs(void); +void load_fpu_regs(void); +void __load_fpu_regs(void); + +static inline int test_fp_ctl(u32 fpc) +{ + u32 orig_fpc; + int rc; + + asm volatile( + " efpc %1\n" + " sfpc %2\n" + "0: sfpc %1\n" + " la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc), "=&d" (orig_fpc) + : "d" (fpc), "0" (-EINVAL)); + return rc; +} + +#define KERNEL_FPC 1 +#define KERNEL_VXR_V0V7 2 +#define KERNEL_VXR_V8V15 4 +#define KERNEL_VXR_V16V23 8 +#define KERNEL_VXR_V24V31 16 + +#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15) +#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23) +#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31) + +#define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH) +#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_LOW) + +struct kernel_fpu; + +/* + * Note the functions below must be called with preemption disabled. + * Do not enable preemption before calling __kernel_fpu_end() to prevent + * an corruption of an existing kernel FPU state. + * + * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions. + */ +void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags); +void __kernel_fpu_end(struct kernel_fpu *state, u32 flags); + + +static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +{ + preempt_disable(); + state->mask = S390_lowcore.fpu_flags; + if (!test_cpu_flag(CIF_FPU)) + /* Save user space FPU state and register contents */ + save_fpu_regs(); + else if (state->mask & flags) + /* Save FPU/vector register in-use by the kernel */ + __kernel_fpu_begin(state, flags); + S390_lowcore.fpu_flags |= flags; +} + +static inline void kernel_fpu_end(struct kernel_fpu *state, u32 flags) +{ + S390_lowcore.fpu_flags = state->mask; + if (state->mask & flags) + /* Restore FPU/vector register in-use by the kernel */ + __kernel_fpu_end(state, flags); + preempt_enable(); +} + +#endif /* _ASM_S390_FPU_API_H */ diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h new file mode 100644 index 0000000000..bbdadb1c9e --- /dev/null +++ b/arch/s390/include/asm/fpu/internal.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * FPU state and register content conversion primitives + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#ifndef _ASM_S390_FPU_INTERNAL_H +#define _ASM_S390_FPU_INTERNAL_H + +#include +#include +#include + +static inline void save_vx_regs(__vector128 *vxrs) +{ + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */ + : "=Q" (*(struct vx_array *) vxrs) : : "1"); +} + +static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + fprs[i].ui = vxrs[i].high; +} + +static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + vxrs[i].high = fprs[i].ui; +} + +static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + fpregs->pad = 0; + fpregs->fpc = fpu->fpc; + if (MACHINE_HAS_VX) + convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); + else + memcpy((freg_t *)&fpregs->fprs, fpu->fprs, + sizeof(fpregs->fprs)); +} + +static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + fpu->fpc = fpregs->fpc; + if (MACHINE_HAS_VX) + convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); + else + memcpy(fpu->fprs, (freg_t *)&fpregs->fprs, + sizeof(fpregs->fprs)); +} + +#endif /* _ASM_S390_FPU_INTERNAL_H */ diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h new file mode 100644 index 0000000000..d889e94368 --- /dev/null +++ b/arch/s390/include/asm/fpu/types.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * FPU data structures + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#ifndef _ASM_S390_FPU_TYPES_H +#define _ASM_S390_FPU_TYPES_H + +#include + +struct fpu { + __u32 fpc; /* Floating-point control */ + void *regs; /* Pointer to the current save area */ + union { + /* Floating-point register save area */ + freg_t fprs[__NUM_FPRS]; + /* Vector register save area */ + __vector128 vxrs[__NUM_VXRS]; + }; +}; + +/* VX array structure for address operand constraints in inline assemblies */ +struct vx_array { __vector128 _[__NUM_VXRS]; }; + +/* In-kernel FPU state structure */ +struct kernel_fpu { + u32 mask; + u32 fpc; + union { + freg_t fprs[__NUM_FPRS]; + __vector128 vxrs[__NUM_VXRS]; + }; +}; + +#endif /* _ASM_S390_FPU_TYPES_H */ diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h new file mode 100644 index 0000000000..5a82b08f03 --- /dev/null +++ b/arch/s390/include/asm/ftrace.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_FTRACE_H +#define _ASM_S390_FTRACE_H + +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +#define ARCH_SUPPORTS_FTRACE_OPS 1 +#define MCOUNT_INSN_SIZE 6 + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_CC_IS_CLANG +/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +#define ftrace_return_address(n) 0UL +#else +#define ftrace_return_address(n) __builtin_return_address(n) +#endif + +void ftrace_caller(void); + +extern void *ftrace_func; + +struct dyn_arch_ftrace { }; + +#define MCOUNT_ADDR 0 +#define FTRACE_ADDR ((unsigned long)ftrace_caller) + +#define KPROBE_ON_FTRACE_NOP 0 +#define KPROBE_ON_FTRACE_CALL 1 + +struct module; +struct dyn_ftrace; + +bool ftrace_need_init_nop(void); +#define ftrace_need_init_nop ftrace_need_init_nop + +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); +#define ftrace_init_nop ftrace_init_nop + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr; +} + +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + + if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) + return regs; + return NULL; +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long gpr2; + unsigned long fp; +}; + +static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->gpr2; +} + +static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->fp; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static __always_inline unsigned long +ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) +{ + return fregs->regs.psw.addr; +} + +static __always_inline void +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, + unsigned long ip) +{ + fregs->regs.psw.addr = ip; +} + +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(&(fregs)->regs, n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(&(fregs)->regs) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(&(fregs)->regs) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(&(fregs)->regs, ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(&(fregs)->regs) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When an ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, + * place the direct caller in the ORIG_GPR2 part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) +{ + struct pt_regs *regs = &fregs->regs; + regs->orig_gpr2 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/* + * Even though the system call numbers are identical for s390/s390x a + * different system call table is used for compat tasks. This may lead + * to e.g. incorrect or missing trace event sysfs files. + * Therefore simply do not trace compat system calls at all. + * See kernel/trace/trace_syscalls.c. + */ +#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) +{ + return is_compat_task(); +} + +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, + const char *name) +{ + /* + * Skip __s390_ and __s390x_ prefix - due to compat wrappers + * and aliasing some symbols of 64 bit system call functions + * may get the __s390_ prefix instead of the __s390x_ prefix. + */ + return !strcmp(sym + 7, name) || !strcmp(sym + 8, name); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */ + +#ifndef CC_USING_HOTPATCH + +#define FTRACE_GEN_MCOUNT_RECORD(name) \ + .section __mcount_loc, "a", @progbits; \ + .quad name; \ + .previous; + +#else /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_MCOUNT_RECORD(name) + +#endif /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_NOP_ASM(name) \ + FTRACE_GEN_MCOUNT_RECORD(name) \ + FTRACE_NOP_INSN + +#else /* CONFIG_FUNCTION_TRACER */ + +#define FTRACE_GEN_NOP_ASM(name) + +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif /* _ASM_S390_FTRACE_H */ diff --git a/arch/s390/include/asm/ftrace.lds.h b/arch/s390/include/asm/ftrace.lds.h new file mode 100644 index 0000000000..968adfd412 --- /dev/null +++ b/arch/s390/include/asm/ftrace.lds.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define SIZEOF_MCOUNT_LOC_ENTRY 8 +#define SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE 24 +#define FTRACE_HOTPATCH_TRAMPOLINES_SIZE(n) \ + DIV_ROUND_UP(SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE * (n), \ + SIZEOF_MCOUNT_LOC_ENTRY) + +#ifdef CONFIG_FUNCTION_TRACER +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT \ + . = ALIGN(8); \ + __ftrace_hotpatch_trampolines_start = .; \ + . = . + FTRACE_HOTPATCH_TRAMPOLINES_SIZE(__stop_mcount_loc - \ + __start_mcount_loc); \ + __ftrace_hotpatch_trampolines_end = .; +#else +#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT +#endif diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h new file mode 100644 index 0000000000..eaeaeb3ff0 --- /dev/null +++ b/arch/s390/include/asm/futex.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_FUTEX_H +#define _ASM_S390_FUTEX_H + +#include +#include +#include +#include +#include + +#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ + asm volatile( \ + " sacf 256\n" \ + "0: l %1,0(%6)\n" \ + "1:"insn \ + "2: cs %1,%2,0(%6)\n" \ + "3: jl 1b\n" \ + " lhi %0,0\n" \ + "4: sacf 768\n" \ + EX_TABLE(0b,4b) EX_TABLE(1b,4b) \ + EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ + "=m" (*uaddr) \ + : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ + "m" (*uaddr) : "cc"); + +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) +{ + int oldval = 0, newval, ret; + + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op("lr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op("lr %2,%1\nar %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op("lr %2,%1\nor %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op("lr %2,%1\nnr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op("lr %2,%1\nxr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } + + if (!ret) + *oval = oldval; + + return ret; +} + +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + int ret; + + asm volatile( + " sacf 256\n" + "0: cs %1,%4,0(%5)\n" + "1: la %0,0\n" + "2: sacf 768\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "=d" (ret), "+d" (oldval), "=m" (*uaddr) + : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) + : "cc", "memory"); + *uval = oldval; + return ret; +} + +#endif /* _ASM_S390_FUTEX_H */ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h new file mode 100644 index 0000000000..5cc46e0dde --- /dev/null +++ b/arch/s390/include/asm/gmap.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016 + * Author(s): Martin Schwidefsky + */ + +#ifndef _ASM_S390_GMAP_H +#define _ASM_S390_GMAP_H + +#include +#include + +/* Generic bits for GMAP notification on DAT table entry changes. */ +#define GMAP_NOTIFY_SHADOW 0x2 +#define GMAP_NOTIFY_MPROT 0x1 + +/* Status bits only for huge segment entries */ +#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ + +/** + * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list + * @crst_list: list of all crst tables used in the guest address space + * @mm: pointer to the parent mm_struct + * @guest_to_host: radix tree with guest to host address translation + * @host_to_guest: radix tree with pointer to segment table entries + * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure + * @table: pointer to the page directory + * @asce: address space control element for gmap page table + * @pfault_enabled: defines if pfaults are applicable for the guest + * @guest_handle: protected virtual machine handle for the ultravisor + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation + * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used + */ +struct gmap { + struct list_head list; + struct list_head crst_list; + struct mm_struct *mm; + struct radix_tree_root guest_to_host; + struct radix_tree_root host_to_guest; + spinlock_t guest_table_lock; + refcount_t ref_count; + unsigned long *table; + unsigned long asce; + unsigned long asce_end; + void *private; + bool pfault_enabled; + /* only set for protected virtual machines */ + unsigned long guest_handle; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + int edat_level; + bool removed; + bool initialized; +}; + +/** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + +/** + * struct gmap_notifier - notify function block for page invalidation + * @notifier_call: address of callback function + */ +struct gmap_notifier { + struct list_head list; + struct rcu_head rcu; + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); +}; + +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + +void gmap_enable(struct gmap *gmap); +void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len); +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); +unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); +unsigned long gmap_translate(struct gmap *, unsigned long gaddr); +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); +int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); +void gmap_discard(struct gmap *, unsigned long from, unsigned long to); +void __gmap_zap(struct gmap *, unsigned long gaddr); +void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); + +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, int *fake); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); + +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); + +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], + unsigned long gaddr, unsigned long vmaddr); +int gmap_mark_unmergeable(void); +void s390_unlist_old_asce(struct gmap *gmap); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. + */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} +#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h new file mode 100644 index 0000000000..58668ffb54 --- /dev/null +++ b/arch/s390/include/asm/hardirq.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * + * Derived from "include/asm-i386/hardirq.h" + */ + +#ifndef __ASM_HARDIRQ_H +#define __ASM_HARDIRQ_H + +#include + +#define local_softirq_pending() (S390_lowcore.softirq_pending) +#define set_softirq_pending(x) (S390_lowcore.softirq_pending = (x)) +#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x)) + +#define __ARCH_IRQ_STAT +#define __ARCH_IRQ_EXIT_IRQS_DISABLED + +static inline void ack_bad_irq(unsigned int irq) +{ + printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); +} + +#endif /* __ASM_HARDIRQ_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h new file mode 100644 index 0000000000..deb198a610 --- /dev/null +++ b/arch/s390/include/asm/hugetlb.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * IBM System z Huge TLB Page Support for Kernel. + * + * Copyright IBM Corp. 2008 + * Author(s): Gerald Schaefer + */ + +#ifndef _ASM_S390_HUGETLB_H +#define _ASM_S390_HUGETLB_H + +#include +#include + +#define hugetlb_free_pgd_range free_pgd_range +#define hugepages_supported() (MACHINE_HAS_EDAT1) + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +pte_t huge_ptep_get(pte_t *ptep); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. + */ +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) +{ + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (addr & ~huge_page_mask(h)) + return -EINVAL; + return 0; +} + +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_arch_1, &page->flags); +} +#define arch_clear_hugepage_flags arch_clear_hugepage_flags + +static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) +{ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + set_pte(ptep, __pte(_REGION3_ENTRY_EMPTY)); + else + set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); +} + +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); +} + +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + int changed = !pte_same(huge_ptep_get(ptep), pte); + if (changed) { + huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + } + return changed; +} + +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) +{ + return mk_pte(page, pgprot); +} + +static inline int huge_pte_none(pte_t pte) +{ + return pte_none(pte); +} + +static inline int huge_pte_none_mostly(pte_t pte) +{ + return huge_pte_none(pte); +} + +static inline int huge_pte_write(pte_t pte) +{ + return pte_write(pte); +} + +static inline int huge_pte_dirty(pte_t pte) +{ + return pte_dirty(pte); +} + +static inline pte_t huge_pte_mkwrite(pte_t pte) +{ + return pte_mkwrite_novma(pte); +} + +static inline pte_t huge_pte_mkdirty(pte_t pte) +{ + return pte_mkdirty(pte); +} + +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} + +static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) +{ + return pte_modify(pte, newprot); +} + +static inline pte_t huge_pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static inline int huge_pte_uffd_wp(pte_t pte) +{ + return 0; +} + +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} + +#endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h new file mode 100644 index 0000000000..9078b5b6b8 --- /dev/null +++ b/arch/s390/include/asm/hw_irq.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _HW_IRQ_H +#define _HW_IRQ_H + +#include +#include + +void __init init_airq_interrupts(void); +void __init init_cio_interrupts(void); + +#endif diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h new file mode 100644 index 0000000000..59fcc3c72e --- /dev/null +++ b/arch/s390/include/asm/idals.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Author(s)......: Holger Smolinski + * Martin Schwidefsky + * Bugreports.to..: + * Copyright IBM Corp. 2000 + * + * History of changes + * 07/24/00 new file + * 05/04/02 code restructuring. + */ + +#ifndef _S390_IDALS_H +#define _S390_IDALS_H + +#include +#include +#include +#include +#include +#include + +#define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ +#define IDA_BLOCK_SIZE (1L<> 31) != 0; +} + + +/* + * Return the number of idal words needed for an address/length pair. + */ +static inline unsigned int idal_nr_words(void *vaddr, unsigned int length) +{ + return ((__pa(vaddr) & (IDA_BLOCK_SIZE-1)) + length + + (IDA_BLOCK_SIZE-1)) >> IDA_SIZE_LOG; +} + +/* + * Return the number of 2K IDA words needed for an address/length pair. + */ +static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length) +{ + return ((__pa(vaddr) & (IDA_2K_BLOCK_SIZE - 1)) + length + + (IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG; +} + +/* + * Create the list of idal words for an address/length pair. + */ +static inline unsigned long *idal_create_words(unsigned long *idaws, + void *vaddr, unsigned int length) +{ + unsigned long paddr; + unsigned int cidaw; + + paddr = __pa(vaddr); + cidaw = ((paddr & (IDA_BLOCK_SIZE-1)) + length + + (IDA_BLOCK_SIZE-1)) >> IDA_SIZE_LOG; + *idaws++ = paddr; + paddr &= -IDA_BLOCK_SIZE; + while (--cidaw > 0) { + paddr += IDA_BLOCK_SIZE; + *idaws++ = paddr; + } + return idaws; +} + +/* + * Sets the address of the data in CCW. + * If necessary it allocates an IDAL and sets the appropriate flags. + */ +static inline int +set_normalized_cda(struct ccw1 * ccw, void *vaddr) +{ + unsigned int nridaws; + unsigned long *idal; + + if (ccw->flags & CCW_FLAG_IDA) + return -EINVAL; + nridaws = idal_nr_words(vaddr, ccw->count); + if (nridaws > 0) { + idal = kmalloc(nridaws * sizeof(unsigned long), + GFP_ATOMIC | GFP_DMA ); + if (idal == NULL) + return -ENOMEM; + idal_create_words(idal, vaddr, ccw->count); + ccw->flags |= CCW_FLAG_IDA; + vaddr = idal; + } + ccw->cda = (__u32)(unsigned long) vaddr; + return 0; +} + +/* + * Releases any allocated IDAL related to the CCW. + */ +static inline void +clear_normalized_cda(struct ccw1 * ccw) +{ + if (ccw->flags & CCW_FLAG_IDA) { + kfree((void *)(unsigned long) ccw->cda); + ccw->flags &= ~CCW_FLAG_IDA; + } + ccw->cda = 0; +} + +/* + * Idal buffer extension + */ +struct idal_buffer { + size_t size; + size_t page_order; + void *data[]; +}; + +/* + * Allocate an idal buffer + */ +static inline struct idal_buffer * +idal_buffer_alloc(size_t size, int page_order) +{ + struct idal_buffer *ib; + int nr_chunks, nr_ptrs, i; + + nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; + nr_chunks = (4096 << page_order) >> IDA_SIZE_LOG; + ib = kmalloc(struct_size(ib, data, nr_ptrs), GFP_DMA | GFP_KERNEL); + if (ib == NULL) + return ERR_PTR(-ENOMEM); + ib->size = size; + ib->page_order = page_order; + for (i = 0; i < nr_ptrs; i++) { + if ((i & (nr_chunks - 1)) != 0) { + ib->data[i] = ib->data[i-1] + IDA_BLOCK_SIZE; + continue; + } + ib->data[i] = (void *) + __get_free_pages(GFP_KERNEL, page_order); + if (ib->data[i] != NULL) + continue; + // Not enough memory + while (i >= nr_chunks) { + i -= nr_chunks; + free_pages((unsigned long) ib->data[i], + ib->page_order); + } + kfree(ib); + return ERR_PTR(-ENOMEM); + } + return ib; +} + +/* + * Free an idal buffer. + */ +static inline void +idal_buffer_free(struct idal_buffer *ib) +{ + int nr_chunks, nr_ptrs, i; + + nr_ptrs = (ib->size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; + nr_chunks = (4096 << ib->page_order) >> IDA_SIZE_LOG; + for (i = 0; i < nr_ptrs; i += nr_chunks) + free_pages((unsigned long) ib->data[i], ib->page_order); + kfree(ib); +} + +/* + * Test if a idal list is really needed. + */ +static inline int +__idal_buffer_is_needed(struct idal_buffer *ib) +{ + return ib->size > (4096ul << ib->page_order) || + idal_is_needed(ib->data[0], ib->size); +} + +/* + * Set channel data address to idal buffer. + */ +static inline void +idal_buffer_set_cda(struct idal_buffer *ib, struct ccw1 *ccw) +{ + if (__idal_buffer_is_needed(ib)) { + // setup idals; + ccw->cda = (u32)(addr_t) ib->data; + ccw->flags |= CCW_FLAG_IDA; + } else + // we do not need idals - use direct addressing + ccw->cda = (u32)(addr_t) ib->data[0]; + ccw->count = ib->size; +} + +/* + * Copy count bytes from an idal buffer to user memory + */ +static inline size_t +idal_buffer_to_user(struct idal_buffer *ib, void __user *to, size_t count) +{ + size_t left; + int i; + + BUG_ON(count > ib->size); + for (i = 0; count > IDA_BLOCK_SIZE; i++) { + left = copy_to_user(to, ib->data[i], IDA_BLOCK_SIZE); + if (left) + return left + count - IDA_BLOCK_SIZE; + to = (void __user *) to + IDA_BLOCK_SIZE; + count -= IDA_BLOCK_SIZE; + } + return copy_to_user(to, ib->data[i], count); +} + +/* + * Copy count bytes from user memory to an idal buffer + */ +static inline size_t +idal_buffer_from_user(struct idal_buffer *ib, const void __user *from, size_t count) +{ + size_t left; + int i; + + BUG_ON(count > ib->size); + for (i = 0; count > IDA_BLOCK_SIZE; i++) { + left = copy_from_user(ib->data[i], from, IDA_BLOCK_SIZE); + if (left) + return left + count - IDA_BLOCK_SIZE; + from = (void __user *) from + IDA_BLOCK_SIZE; + count -= IDA_BLOCK_SIZE; + } + return copy_from_user(ib->data[i], from, count); +} + +#endif diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h new file mode 100644 index 0000000000..09f763b9eb --- /dev/null +++ b/arch/s390/include/asm/idle.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2014 + * + * Author: Martin Schwidefsky + */ + +#ifndef _S390_IDLE_H +#define _S390_IDLE_H + +#include +#include + +struct s390_idle_data { + unsigned long idle_count; + unsigned long idle_time; + unsigned long clock_idle_enter; + unsigned long timer_idle_enter; + unsigned long mt_cycles_enter[8]; +}; + +extern struct device_attribute dev_attr_idle_count; +extern struct device_attribute dev_attr_idle_time_us; + +void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); + +#endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h new file mode 100644 index 0000000000..4453ad7c11 --- /dev/null +++ b/arch/s390/include/asm/io.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/io.h" + */ + +#ifndef _S390_IO_H +#define _S390_IO_H + +#include +#include +#include +#include + +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +void *xlate_dev_mem_ptr(phys_addr_t phys); +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); + +#define IO_SPACE_LIMIT 0 + +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) + +static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) +{ + return NULL; +} + +static inline void ioport_unmap(void __iomem *p) +{ +} + +#ifdef CONFIG_PCI + +/* + * s390 needs a private implementation of pci_iomap since ioremap with its + * offset parameter isn't sufficient. That's because BAR spaces are not + * disjunctive on s390 so we need the bar parameter of pci_iomap to find + * the corresponding device and create the mapping cookie. + */ +#define pci_iomap pci_iomap +#define pci_iomap_range pci_iomap_range +#define pci_iounmap pci_iounmap +#define pci_iomap_wc pci_iomap_wc +#define pci_iomap_wc_range pci_iomap_wc_range + +#define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) +#define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) +#define memset_io(dst, val, count) zpci_memset_io(dst, val, count) + +#define mmiowb() zpci_barrier() + +#define __raw_readb zpci_read_u8 +#define __raw_readw zpci_read_u16 +#define __raw_readl zpci_read_u32 +#define __raw_readq zpci_read_u64 +#define __raw_writeb zpci_write_u8 +#define __raw_writew zpci_write_u16 +#define __raw_writel zpci_write_u32 +#define __raw_writeq zpci_write_u64 + +#endif /* CONFIG_PCI */ + +#include + +#endif diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h new file mode 100644 index 0000000000..b0d0003247 --- /dev/null +++ b/arch/s390/include/asm/ipl.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 (re)ipl support + * + * Copyright IBM Corp. 2007 + */ + +#ifndef _ASM_S390_IPL_H +#define _ASM_S390_IPL_H + +#include +#include +#include +#include +#include +#include + +struct ipl_parameter_block { + struct ipl_pl_hdr hdr; + union { + struct ipl_pb_hdr pb0_hdr; + struct ipl_pb0_common common; + struct ipl_pb0_fcp fcp; + struct ipl_pb0_ccw ccw; + struct ipl_pb0_eckd eckd; + struct ipl_pb0_nvme nvme; + char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; + }; +} __packed __aligned(PAGE_SIZE); + +#define NSS_NAME_SIZE 8 + +#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_fcp)) +#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) + +#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_nvme)) +#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme)) + +#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_ccw)) +#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) + +#define IPL_BP_ECKD_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_eckd)) +#define IPL_BP0_ECKD_LEN (sizeof(struct ipl_pb0_eckd)) + +#define IPL_MAX_SUPPORTED_VERSION (0) + +#define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) + +#define DIAG308_VMPARM_SIZE (64) +#define DIAG308_SCPDATA_OFFSET offsetof(struct ipl_parameter_block, \ + fcp.scp_data) +#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - DIAG308_SCPDATA_OFFSET) + +struct save_area; +struct save_area * __init save_area_alloc(bool is_boot_cpu); +struct save_area * __init save_area_boot_cpu(void); +void __init save_area_add_regs(struct save_area *, void *regs); +void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); + +extern void s390_reset_system(void); +extern size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, + const struct ipl_parameter_block *ipb); + +enum ipl_type { + IPL_TYPE_UNKNOWN = 1, + IPL_TYPE_CCW = 2, + IPL_TYPE_FCP = 4, + IPL_TYPE_FCP_DUMP = 8, + IPL_TYPE_NSS = 16, + IPL_TYPE_NVME = 32, + IPL_TYPE_NVME_DUMP = 64, + IPL_TYPE_ECKD = 128, + IPL_TYPE_ECKD_DUMP = 256, +}; + +struct ipl_info +{ + enum ipl_type type; + union { + struct { + struct ccw_dev_id dev_id; + } ccw; + struct { + struct ccw_dev_id dev_id; + } eckd; + struct { + struct ccw_dev_id dev_id; + u64 wwpn; + u64 lun; + } fcp; + struct { + u32 fid; + u32 nsid; + } nvme; + struct { + char name[NSS_NAME_SIZE + 1]; + } nss; + } data; +}; + +extern struct ipl_info ipl_info; +extern void setup_ipl(void); +extern void set_os_info_reipl_block(void); + +static inline bool is_ipl_type_dump(void) +{ + return (ipl_info.type == IPL_TYPE_FCP_DUMP) || + (ipl_info.type == IPL_TYPE_ECKD_DUMP) || + (ipl_info.type == IPL_TYPE_NVME_DUMP); +} + +struct ipl_report { + struct ipl_parameter_block *ipib; + struct list_head components; + struct list_head certificates; + size_t size; +}; + +struct ipl_report_component { + struct list_head list; + struct ipl_rb_component_entry entry; +}; + +struct ipl_report_certificate { + struct list_head list; + struct ipl_rb_certificate_entry entry; + void *key; +}; + +struct kexec_buf; +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib); +void *ipl_report_finish(struct ipl_report *report); +int ipl_report_free(struct ipl_report *report); +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert); +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len); + +/* + * DIAG 308 support + */ +enum diag308_subcode { + DIAG308_CLEAR_RESET = 0, + DIAG308_LOAD_NORMAL_RESET = 1, + DIAG308_REL_HSA = 2, + DIAG308_LOAD_CLEAR = 3, + DIAG308_LOAD_NORMAL_DUMP = 4, + DIAG308_SET = 5, + DIAG308_STORE = 6, + DIAG308_LOAD_NORMAL = 7, +}; + +enum diag308_subcode_flags { + DIAG308_FLAG_EI = 1UL << 16, +}; + +enum diag308_rc { + DIAG308_RC_OK = 0x0001, + DIAG308_RC_NOCONFIG = 0x0102, +}; + +extern int diag308(unsigned long subcode, void *addr); +extern void store_status(void (*fn)(void *), void *data); +extern void lgr_info_log(void); + +#endif /* _ASM_S390_IPL_H */ diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h new file mode 100644 index 0000000000..89902f7547 --- /dev/null +++ b/arch/s390/include/asm/irq.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_IRQ_H +#define _ASM_IRQ_H + +#define EXT_INTERRUPT 0 +#define IO_INTERRUPT 1 +#define THIN_INTERRUPT 2 + +#define NR_IRQS_BASE 3 + +#define NR_IRQS NR_IRQS_BASE +#define NR_IRQS_LEGACY NR_IRQS_BASE + +/* External interruption codes */ +#define EXT_IRQ_INTERRUPT_KEY 0x0040 +#define EXT_IRQ_CLK_COMP 0x1004 +#define EXT_IRQ_CPU_TIMER 0x1005 +#define EXT_IRQ_WARNING_TRACK 0x1007 +#define EXT_IRQ_MALFUNC_ALERT 0x1200 +#define EXT_IRQ_EMERGENCY_SIG 0x1201 +#define EXT_IRQ_EXTERNAL_CALL 0x1202 +#define EXT_IRQ_TIMING_ALERT 0x1406 +#define EXT_IRQ_MEASURE_ALERT 0x1407 +#define EXT_IRQ_SERVICE_SIG 0x2401 +#define EXT_IRQ_CP_SERVICE 0x2603 +#define EXT_IRQ_IUCV 0x4000 + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include + +enum interruption_class { + IRQEXT_CLK, + IRQEXT_EXC, + IRQEXT_EMS, + IRQEXT_TMR, + IRQEXT_TLA, + IRQEXT_PFL, + IRQEXT_DSD, + IRQEXT_VRT, + IRQEXT_SCP, + IRQEXT_IUC, + IRQEXT_CMS, + IRQEXT_CMC, + IRQEXT_FTP, + IRQIO_CIO, + IRQIO_DAS, + IRQIO_C15, + IRQIO_C70, + IRQIO_TAP, + IRQIO_VMR, + IRQIO_LCS, + IRQIO_CTC, + IRQIO_ADM, + IRQIO_CSC, + IRQIO_VIR, + IRQIO_QAI, + IRQIO_APB, + IRQIO_PCF, + IRQIO_PCD, + IRQIO_MSI, + IRQIO_VAI, + IRQIO_GAL, + NMI_NMI, + CPU_RST, + NR_ARCH_IRQS +}; + +struct irq_stat { + unsigned int irqs[NR_ARCH_IRQS]; +}; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); + +static __always_inline void inc_irq_stat(enum interruption_class irq) +{ + __this_cpu_inc(irq_stat.irqs[irq]); +} + +struct ext_code { + union { + struct { + unsigned short subcode; + unsigned short code; + }; + unsigned int int_code; + }; +}; + +typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long); + +int register_external_irq(u16 code, ext_int_handler_t handler); +int unregister_external_irq(u16 code, ext_int_handler_t handler); + +enum irq_subclass { + IRQ_SUBCLASS_MEASUREMENT_ALERT = 5, + IRQ_SUBCLASS_SERVICE_SIGNAL = 9, +}; + +#define CR0_IRQ_SUBCLASS_MASK \ + ((1UL << (63 - 30)) /* Warning Track */ | \ + (1UL << (63 - 48)) /* Malfunction Alert */ | \ + (1UL << (63 - 49)) /* Emergency Signal */ | \ + (1UL << (63 - 50)) /* External Call */ | \ + (1UL << (63 - 52)) /* Clock Comparator */ | \ + (1UL << (63 - 53)) /* CPU Timer */ | \ + (1UL << (63 - 54)) /* Service Signal */ | \ + (1UL << (63 - 57)) /* Interrupt Key */ | \ + (1UL << (63 - 58)) /* Measurement Alert */ | \ + (1UL << (63 - 59)) /* Timing Alert */ | \ + (1UL << (63 - 62))) /* IUCV */ + +void irq_subclass_register(enum irq_subclass subclass); +void irq_subclass_unregister(enum irq_subclass subclass); + +#define irq_canonicalize(irq) (irq) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_IRQ_H */ diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h new file mode 100644 index 0000000000..603783766d --- /dev/null +++ b/arch/s390/include/asm/irq_work.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_IRQ_WORK_H +#define _ASM_S390_IRQ_WORK_H + +static inline bool arch_irq_work_has_interrupt(void) +{ + return true; +} + +void arch_irq_work_raise(void); + +#endif /* _ASM_S390_IRQ_WORK_H */ diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h new file mode 100644 index 0000000000..02427b205c --- /dev/null +++ b/arch/s390/include/asm/irqflags.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2006, 2010 + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_IRQFLAGS_H +#define __ASM_IRQFLAGS_H + +#include + +#define ARCH_IRQ_ENABLED (3UL << (BITS_PER_LONG - 8)) + +/* store then OR system mask. */ +#define __arch_local_irq_stosm(__or) \ +({ \ + unsigned long __mask; \ + asm volatile( \ + " stosm %0,%1" \ + : "=Q" (__mask) : "i" (__or) : "memory"); \ + __mask; \ +}) + +/* store then AND system mask. */ +#define __arch_local_irq_stnsm(__and) \ +({ \ + unsigned long __mask; \ + asm volatile( \ + " stnsm %0,%1" \ + : "=Q" (__mask) : "i" (__and) : "memory"); \ + __mask; \ +}) + +/* set system mask. */ +static __always_inline void __arch_local_irq_ssm(unsigned long flags) +{ + asm volatile("ssm %0" : : "Q" (flags) : "memory"); +} + +static __always_inline unsigned long arch_local_save_flags(void) +{ + return __arch_local_irq_stnsm(0xff); +} + +static __always_inline unsigned long arch_local_irq_save(void) +{ + return __arch_local_irq_stnsm(0xfc); +} + +static __always_inline void arch_local_irq_disable(void) +{ + arch_local_irq_save(); +} + +static __always_inline void arch_local_irq_enable(void) +{ + __arch_local_irq_stosm(0x03); +} + +/* This only restores external and I/O interrupt state */ +static __always_inline void arch_local_irq_restore(unsigned long flags) +{ + /* only disabled->disabled and disabled->enabled is valid */ + if (flags & ARCH_IRQ_ENABLED) + arch_local_irq_enable(); +} + +static __always_inline bool arch_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & ARCH_IRQ_ENABLED); +} + +static __always_inline bool arch_irqs_disabled(void) +{ + return arch_irqs_disabled_flags(arch_local_save_flags()); +} + +#endif /* __ASM_IRQFLAGS_H */ diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h new file mode 100644 index 0000000000..b2cc1ec78d --- /dev/null +++ b/arch/s390/include/asm/isc.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ISC_H +#define _ASM_S390_ISC_H + +#include + +/* + * I/O interruption subclasses used by drivers. + * Please add all used iscs here so that it is possible to distribute + * isc usage between drivers. + * Reminder: 0 is highest priority, 7 lowest. + */ +#define MAX_ISC 7 + +/* Regular I/O interrupts. */ +#define IO_SCH_ISC 3 /* regular I/O subchannels */ +#define CONSOLE_ISC 1 /* console I/O subchannel */ +#define EADM_SCH_ISC 4 /* EADM subchannels */ +#define CHSC_SCH_ISC 7 /* CHSC subchannels */ +#define VFIO_CCW_ISC IO_SCH_ISC /* VFIO-CCW I/O subchannels */ +/* Adapter interrupts. */ +#define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ +#define PCI_ISC 2 /* PCI I/O subchannels */ +#define GAL_ISC 5 /* GIB alert */ +#define AP_ISC 6 /* adjunct processor (crypto) devices */ + +/* Functions for registration of I/O interruption subclasses */ +void isc_register(unsigned int isc); +void isc_unregister(unsigned int isc); + +#endif /* _ASM_S390_ISC_H */ diff --git a/arch/s390/include/asm/itcw.h b/arch/s390/include/asm/itcw.h new file mode 100644 index 0000000000..59b7396132 --- /dev/null +++ b/arch/s390/include/asm/itcw.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions for incremental construction of fcx enabled I/O control blocks. + * + * Copyright IBM Corp. 2008 + * Author(s): Peter Oberparleiter + */ + +#ifndef _ASM_S390_ITCW_H +#define _ASM_S390_ITCW_H + +#include +#include + +#define ITCW_OP_READ 0 +#define ITCW_OP_WRITE 1 + +struct itcw; + +struct tcw *itcw_get_tcw(struct itcw *itcw); +size_t itcw_calc_size(int intrg, int max_tidaws, int intrg_max_tidaws); +struct itcw *itcw_init(void *buffer, size_t size, int op, int intrg, + int max_tidaws, int intrg_max_tidaws); +struct dcw *itcw_add_dcw(struct itcw *itcw, u8 cmd, u8 flags, void *cd, + u8 cd_count, u32 count); +struct tidaw *itcw_add_tidaw(struct itcw *itcw, u8 flags, void *addr, + u32 count); +void itcw_set_data(struct itcw *itcw, void *addr, int use_tidal); +void itcw_finalize(struct itcw *itcw); + +#endif /* _ASM_S390_ITCW_H */ diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h new file mode 100644 index 0000000000..895f774bbc --- /dev/null +++ b/arch/s390/include/asm/jump_label.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_JUMP_LABEL_H +#define _ASM_S390_JUMP_LABEL_H + +#define HAVE_JUMP_LABEL_BATCH + +#ifndef __ASSEMBLY__ + +#include +#include + +#define JUMP_LABEL_NOP_SIZE 6 + +#ifdef CONFIG_CC_IS_CLANG +#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i" +#elif __GNUC__ < 9 +#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "X" +#else +#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "jdd" +#endif + +/* + * We use a brcl 0, instruction for jump labels so it + * can be easily distinguished from a hotpatch generated instruction. + */ +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +{ + asm_volatile_goto("0: brcl 0,%l[label]\n" + ".pushsection __jump_table,\"aw\"\n" + ".balign 8\n" + ".long 0b-.,%l[label]-.\n" + ".quad %0+%1-.\n" + ".popsection\n" + : : JUMP_LABEL_STATIC_KEY_CONSTRAINT (key), "i" (branch) : : label); + return false; +label: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("0: brcl 15,%l[label]\n" + ".pushsection __jump_table,\"aw\"\n" + ".balign 8\n" + ".long 0b-.,%l[label]-.\n" + ".quad %0+%1-.\n" + ".popsection\n" + : : JUMP_LABEL_STATIC_KEY_CONSTRAINT (key), "i" (branch) : : label); + return false; +label: + return true; +} + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h new file mode 100644 index 0000000000..0cffead0f2 --- /dev/null +++ b/arch/s390/include/asm/kasan.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_KASAN_H +#define __ASM_KASAN_H + +#include + +#ifdef CONFIG_KASAN + +#define KASAN_SHADOW_SCALE_SHIFT 3 +#define KASAN_SHADOW_SIZE \ + (_AC(1, UL) << (_REGION1_SHIFT - KASAN_SHADOW_SCALE_SHIFT)) +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) +#define KASAN_SHADOW_START KASAN_SHADOW_OFFSET +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) + +#endif + +#endif diff --git a/arch/s390/include/asm/kdebug.h b/arch/s390/include/asm/kdebug.h new file mode 100644 index 0000000000..4377238e47 --- /dev/null +++ b/arch/s390/include/asm/kdebug.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_KDEBUG_H +#define _S390_KDEBUG_H + +/* + * Feb 2006 Ported to s390 + */ + +struct pt_regs; + +enum die_val { + DIE_OOPS = 1, + DIE_BPT, + DIE_SSTEP, + DIE_PANIC, + DIE_NMI, + DIE_DIE, + DIE_NMIWATCHDOG, + DIE_KERNELDEBUG, + DIE_TRAP, + DIE_GPF, + DIE_CALL, + DIE_NMI_IPI, +}; + +extern void __noreturn die(struct pt_regs *, const char *); + +#endif diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h new file mode 100644 index 0000000000..1bd08eb56d --- /dev/null +++ b/arch/s390/include/asm/kexec.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2005 + * + * Author(s): Rolf Adelsberger + * + */ + +#ifndef _S390_KEXEC_H +#define _S390_KEXEC_H + +#include + +#include +#include +#include +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) + +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) + +/* Maximum address we can use for the control pages */ +/* Not more than 2GB */ +#define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) + +/* Allocate control page with GFP_DMA */ +#define KEXEC_CONTROL_MEMORY_GFP (GFP_DMA | __GFP_NORETRY) + +/* Maximum address we can use for the crash control pages */ +#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL) + +/* Allocate one page for the pdp and the second for the code */ +#define KEXEC_CONTROL_PAGE_SIZE 4096 + +/* Alignment of crashkernel memory */ +#define KEXEC_CRASH_MEM_ALIGN HPAGE_SIZE + +/* The native architecture */ +#define KEXEC_ARCH KEXEC_ARCH_S390 + +/* Allow kexec_file to load a segment to 0 */ +#define KEXEC_BUF_MEM_UNKNOWN -1 + +/* Provide a dummy definition to avoid build failures. */ +static inline void crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) { } + +struct kimage; +struct s390_load_data { + /* Pointer to the kernel buffer. Used to register cmdline etc.. */ + void *kernel_buf; + + /* Load address of the kernel_buf. */ + unsigned long kernel_mem; + + /* Parmarea in the kernel buffer. */ + struct parmarea *parm; + + /* Total size of loaded segments in memory. Used as an offset. */ + size_t memsz; + + struct ipl_report *report; +}; + +int s390_verify_sig(const char *kernel, unsigned long kernel_len); +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)); +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr); + +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + void *ipl_buf; +}; + +extern const struct kexec_file_ops s390_kexec_image_ops; +extern const struct kexec_file_ops s390_kexec_elf_ops; + +#ifdef CONFIG_CRASH_DUMP +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range + +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres +#endif + +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup +#endif +#endif /*_S390_KEXEC_H */ diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h new file mode 100644 index 0000000000..e47fd8cbe7 --- /dev/null +++ b/arch/s390/include/asm/kfence.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_KFENCE_H +#define _ASM_S390_KFENCE_H + +#include +#include +#include +#include + +void __kernel_map_pages(struct page *page, int numpages, int enable); + +static __always_inline bool arch_kfence_init_pool(void) +{ + return true; +} + +#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK) + +/* + * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(), + * but earlier where page table allocations still happen with memblock. + * Reason is that arch_kfence_init_pool() gets called when the system + * is still in a limbo state - disabling and enabling bottom halves is + * not yet allowed, but that is what our page_table_alloc() would do. + */ +static __always_inline void kfence_split_mapping(void) +{ +#ifdef CONFIG_KFENCE + unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT; + + set_memory_4k((unsigned long)__kfence_pool, pool_pages); +#endif +} + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); + return true; +} + +#endif /* _ASM_S390_KFENCE_H */ diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h new file mode 100644 index 0000000000..83f732ca3a --- /dev/null +++ b/arch/s390/include/asm/kprobes.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _ASM_S390_KPROBES_H +#define _ASM_S390_KPROBES_H +/* + * Kernel Probes (KProbes) + * + * Copyright IBM Corp. 2002, 2006 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes suggestions from + * Rusty Russell). + * 2004-Nov Modified for PPC64 by Ananth N Mavinakayanahalli + * + * 2005-Dec Used as a template for s390 by Mike Grundy + * + */ +#include +#include + +#define BREAKPOINT_INSTRUCTION 0x0002 + +#define FIXUP_PSW_NORMAL 0x08 +#define FIXUP_BRANCH_NOT_TAKEN 0x04 +#define FIXUP_RETURN_REGISTER 0x02 +#define FIXUP_NOT_REQUIRED 0x01 + +int probe_is_prohibited_opcode(u16 *insn); +int probe_get_fixup_type(u16 *insn); +int probe_is_insn_relative_long(u16 *insn); + +#ifdef CONFIG_KPROBES +#include +#include +#include + +#define __ARCH_WANT_KPROBES_INSN_SLOT + +struct pt_regs; +struct kprobe; + +typedef u16 kprobe_opcode_t; + +/* Maximum instruction size is 3 (16bit) halfwords: */ +#define MAX_INSN_SIZE 0x0003 +#define MAX_STACK_SIZE 64 +#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \ + (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR))) \ + ? (MAX_STACK_SIZE) \ + : (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR))) + +#define kretprobe_blacklist_size 0 + +/* Architecture specific copy of original instruction */ +struct arch_specific_insn { + /* copy of original instruction */ + kprobe_opcode_t *insn; +}; + +struct prev_kprobe { + struct kprobe *kp; + unsigned long status; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned long kprobe_status; + unsigned long kprobe_saved_imask; + unsigned long kprobe_saved_ctl[3]; + struct prev_kprobe prev_kprobe; +}; + +void arch_remove_kprobe(struct kprobe *p); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr); +int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data); + +#define flush_insn_slot(p) do { } while (0) + +#endif /* CONFIG_KPROBES */ +#endif /* _ASM_S390_KPROBES_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h new file mode 100644 index 0000000000..427f9528a7 --- /dev/null +++ b/arch/s390/include/asm/kvm_host.h @@ -0,0 +1,1060 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * definition for kernel virtual machines on s390 + * + * Copyright IBM Corp. 2008, 2018 + * + * Author(s): Carsten Otte + */ + + +#ifndef ASM_KVM_HOST_H +#define ASM_KVM_HOST_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KVM_S390_BSCA_CPU_SLOTS 64 +#define KVM_S390_ESCA_CPU_SLOTS 248 +#define KVM_MAX_VCPUS 255 + +/* + * These seem to be used for allocating ->chip in the routing table, which we + * don't use. 1 is as small as we can get to reduce the needed memory. If we + * need to look at ->chip later on, we'll need to revisit this. + */ +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 1 +#define KVM_HALT_POLL_NS_DEFAULT 50000 + +/* s390-specific vcpu->requests bit members */ +#define KVM_REQ_ENABLE_IBS KVM_ARCH_REQ(0) +#define KVM_REQ_DISABLE_IBS KVM_ARCH_REQ(1) +#define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2) +#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) +#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) +#define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5) +#define KVM_REQ_REFRESH_GUEST_PREFIX \ + KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) + +#define SIGP_CTRL_C 0x80 +#define SIGP_CTRL_SCN_MASK 0x3f + +union bsca_sigp_ctrl { + __u8 value; + struct { + __u8 c : 1; + __u8 r : 1; + __u8 scn : 6; + }; +}; + +union esca_sigp_ctrl { + __u16 value; + struct { + __u8 c : 1; + __u8 reserved: 7; + __u8 scn; + }; +}; + +struct esca_entry { + union esca_sigp_ctrl sigp_ctrl; + __u16 reserved1[3]; + __u64 sda; + __u64 reserved2[6]; +}; + +struct bsca_entry { + __u8 reserved0; + union bsca_sigp_ctrl sigp_ctrl; + __u16 reserved[3]; + __u64 sda; + __u64 reserved2[2]; +}; + +union ipte_control { + unsigned long val; + struct { + unsigned long k : 1; + unsigned long kh : 31; + unsigned long kg : 32; + }; +}; + +union sca_utility { + __u16 val; + struct { + __u16 mtcr : 1; + __u16 reserved : 15; + }; +}; + +struct bsca_block { + union ipte_control ipte_control; + __u64 reserved[5]; + __u64 mcn; + union sca_utility utility; + __u8 reserved2[6]; + struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; +}; + +struct esca_block { + union ipte_control ipte_control; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[6]; + __u64 mcn[4]; + __u64 reserved3[20]; + struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; +}; + +/* + * This struct is used to store some machine check info from lowcore + * for machine checks that happen while the guest is running. + * This info in host's lowcore might be overwritten by a second machine + * check from host when host is in the machine check's high-level handling. + * The size is 24 bytes. + */ +struct mcck_volatile_info { + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 reserved; +}; + +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + +#define SIDAD_SIZE_MASK 0xff +#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + +struct kvm_s390_sie_block { + atomic_t cpuflags; /* 0x0000 */ + __u32 : 1; /* 0x0004 */ + __u32 prefix : 18; + __u32 : 1; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE (1<<0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE (1<<0) +#define PROG_REQUEST (1<<1) + atomic_t prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 0x2C +#define ICPT_PARTEXEC 0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + psw_t gpsw; /* 0x0090 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +struct kvm_s390_itdb { + __u8 data[256]; +}; + +struct sie_page { + struct kvm_s390_sie_block sie_block; + struct mcck_volatile_info mcck_info; /* 0x0200 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ + struct kvm_s390_itdb itdb; /* 0x0600 */ + __u8 reserved700[2304]; /* 0x0700 */ +}; + +struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; + u64 exit_userspace; + u64 exit_null; + u64 exit_external_request; + u64 exit_io_request; + u64 exit_external_interrupt; + u64 exit_stop_request; + u64 exit_validity; + u64 exit_instruction; + u64 exit_pei; + u64 halt_no_poll_steal; + u64 instruction_lctl; + u64 instruction_lctlg; + u64 instruction_stctl; + u64 instruction_stctg; + u64 exit_program_interruption; + u64 exit_instr_and_program; + u64 exit_operation_exception; + u64 deliver_ckc; + u64 deliver_cputm; + u64 deliver_external_call; + u64 deliver_emergency_signal; + u64 deliver_service_signal; + u64 deliver_virtio; + u64 deliver_stop_signal; + u64 deliver_prefix_signal; + u64 deliver_restart_signal; + u64 deliver_program; + u64 deliver_io; + u64 deliver_machine_check; + u64 exit_wait_state; + u64 inject_ckc; + u64 inject_cputm; + u64 inject_external_call; + u64 inject_emergency_signal; + u64 inject_mchk; + u64 inject_pfault_init; + u64 inject_program; + u64 inject_restart; + u64 inject_set_prefix; + u64 inject_stop_signal; + u64 instruction_epsw; + u64 instruction_gs; + u64 instruction_io_other; + u64 instruction_lpsw; + u64 instruction_lpswe; + u64 instruction_pfmf; + u64 instruction_ptff; + u64 instruction_sck; + u64 instruction_sckpf; + u64 instruction_stidp; + u64 instruction_spx; + u64 instruction_stpx; + u64 instruction_stap; + u64 instruction_iske; + u64 instruction_ri; + u64 instruction_rrbe; + u64 instruction_sske; + u64 instruction_ipte_interlock; + u64 instruction_stsi; + u64 instruction_stfl; + u64 instruction_tb; + u64 instruction_tpi; + u64 instruction_tprot; + u64 instruction_tsch; + u64 instruction_sie; + u64 instruction_essa; + u64 instruction_sthyi; + u64 instruction_sigp_sense; + u64 instruction_sigp_sense_running; + u64 instruction_sigp_external_call; + u64 instruction_sigp_emergency; + u64 instruction_sigp_cond_emergency; + u64 instruction_sigp_start; + u64 instruction_sigp_stop; + u64 instruction_sigp_stop_store_status; + u64 instruction_sigp_store_status; + u64 instruction_sigp_store_adtl_status; + u64 instruction_sigp_arch; + u64 instruction_sigp_prefix; + u64 instruction_sigp_restart; + u64 instruction_sigp_init_cpu_reset; + u64 instruction_sigp_cpu_reset; + u64 instruction_sigp_unknown; + u64 instruction_diagnose_10; + u64 instruction_diagnose_44; + u64 instruction_diagnose_9c; + u64 diag_9c_ignored; + u64 diag_9c_forward; + u64 instruction_diagnose_258; + u64 instruction_diagnose_308; + u64 instruction_diagnose_500; + u64 instruction_diagnose_other; + u64 pfault_sync; +}; + +#define PGM_OPERATION 0x01 +#define PGM_PRIVILEGED_OP 0x02 +#define PGM_EXECUTE 0x03 +#define PGM_PROTECTION 0x04 +#define PGM_ADDRESSING 0x05 +#define PGM_SPECIFICATION 0x06 +#define PGM_DATA 0x07 +#define PGM_FIXED_POINT_OVERFLOW 0x08 +#define PGM_FIXED_POINT_DIVIDE 0x09 +#define PGM_DECIMAL_OVERFLOW 0x0a +#define PGM_DECIMAL_DIVIDE 0x0b +#define PGM_HFP_EXPONENT_OVERFLOW 0x0c +#define PGM_HFP_EXPONENT_UNDERFLOW 0x0d +#define PGM_HFP_SIGNIFICANCE 0x0e +#define PGM_HFP_DIVIDE 0x0f +#define PGM_SEGMENT_TRANSLATION 0x10 +#define PGM_PAGE_TRANSLATION 0x11 +#define PGM_TRANSLATION_SPEC 0x12 +#define PGM_SPECIAL_OPERATION 0x13 +#define PGM_OPERAND 0x15 +#define PGM_TRACE_TABEL 0x16 +#define PGM_VECTOR_PROCESSING 0x1b +#define PGM_SPACE_SWITCH 0x1c +#define PGM_HFP_SQUARE_ROOT 0x1d +#define PGM_PC_TRANSLATION_SPEC 0x1f +#define PGM_AFX_TRANSLATION 0x20 +#define PGM_ASX_TRANSLATION 0x21 +#define PGM_LX_TRANSLATION 0x22 +#define PGM_EX_TRANSLATION 0x23 +#define PGM_PRIMARY_AUTHORITY 0x24 +#define PGM_SECONDARY_AUTHORITY 0x25 +#define PGM_LFX_TRANSLATION 0x26 +#define PGM_LSX_TRANSLATION 0x27 +#define PGM_ALET_SPECIFICATION 0x28 +#define PGM_ALEN_TRANSLATION 0x29 +#define PGM_ALE_SEQUENCE 0x2a +#define PGM_ASTE_VALIDITY 0x2b +#define PGM_ASTE_SEQUENCE 0x2c +#define PGM_EXTENDED_AUTHORITY 0x2d +#define PGM_LSTE_SEQUENCE 0x2e +#define PGM_ASTE_INSTANCE 0x2f +#define PGM_STACK_FULL 0x30 +#define PGM_STACK_EMPTY 0x31 +#define PGM_STACK_SPECIFICATION 0x32 +#define PGM_STACK_TYPE 0x33 +#define PGM_STACK_OPERATION 0x34 +#define PGM_ASCE_TYPE 0x38 +#define PGM_REGION_FIRST_TRANS 0x39 +#define PGM_REGION_SECOND_TRANS 0x3a +#define PGM_REGION_THIRD_TRANS 0x3b +#define PGM_MONITOR 0x40 +#define PGM_PER 0x80 +#define PGM_CRYPTO_OPERATION 0x119 + +/* irq types in ascend order of priorities */ +enum irq_types { + IRQ_PEND_SET_PREFIX = 0, + IRQ_PEND_RESTART, + IRQ_PEND_SIGP_STOP, + IRQ_PEND_IO_ISC_7, + IRQ_PEND_IO_ISC_6, + IRQ_PEND_IO_ISC_5, + IRQ_PEND_IO_ISC_4, + IRQ_PEND_IO_ISC_3, + IRQ_PEND_IO_ISC_2, + IRQ_PEND_IO_ISC_1, + IRQ_PEND_IO_ISC_0, + IRQ_PEND_VIRTIO, + IRQ_PEND_PFAULT_DONE, + IRQ_PEND_PFAULT_INIT, + IRQ_PEND_EXT_HOST, + IRQ_PEND_EXT_SERVICE, + IRQ_PEND_EXT_SERVICE_EV, + IRQ_PEND_EXT_TIMING, + IRQ_PEND_EXT_CPU_TIMER, + IRQ_PEND_EXT_CLOCK_COMP, + IRQ_PEND_EXT_EXTERNAL, + IRQ_PEND_EXT_EMERGENCY, + IRQ_PEND_EXT_MALFUNC, + IRQ_PEND_EXT_IRQ_KEY, + IRQ_PEND_MCHK_REP, + IRQ_PEND_PROG, + IRQ_PEND_SVC, + IRQ_PEND_MCHK_EX, + IRQ_PEND_COUNT +}; + +/* We have 2M for virtio device descriptor pages. Smallest amount of + * memory per page is 24 bytes (1 queue), so (2048*1024) / 24 = 87381 + */ +#define KVM_S390_MAX_VIRTIO_IRQS 87381 + +/* + * Repressible (non-floating) machine check interrupts + * subclass bits in MCIC + */ +#define MCHK_EXTD_BIT 58 +#define MCHK_DEGR_BIT 56 +#define MCHK_WARN_BIT 55 +#define MCHK_REP_MASK ((1UL << MCHK_DEGR_BIT) | \ + (1UL << MCHK_EXTD_BIT) | \ + (1UL << MCHK_WARN_BIT)) + +/* Exigent machine check interrupts subclass bits in MCIC */ +#define MCHK_SD_BIT 63 +#define MCHK_PD_BIT 62 +#define MCHK_EX_MASK ((1UL << MCHK_SD_BIT) | (1UL << MCHK_PD_BIT)) + +#define IRQ_PEND_EXT_MASK ((1UL << IRQ_PEND_EXT_IRQ_KEY) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_MALFUNC) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_TIMING) | \ + (1UL << IRQ_PEND_EXT_HOST) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV) | \ + (1UL << IRQ_PEND_VIRTIO) | \ + (1UL << IRQ_PEND_PFAULT_INIT) | \ + (1UL << IRQ_PEND_PFAULT_DONE)) + +#define IRQ_PEND_IO_MASK ((1UL << IRQ_PEND_IO_ISC_0) | \ + (1UL << IRQ_PEND_IO_ISC_1) | \ + (1UL << IRQ_PEND_IO_ISC_2) | \ + (1UL << IRQ_PEND_IO_ISC_3) | \ + (1UL << IRQ_PEND_IO_ISC_4) | \ + (1UL << IRQ_PEND_IO_ISC_5) | \ + (1UL << IRQ_PEND_IO_ISC_6) | \ + (1UL << IRQ_PEND_IO_ISC_7)) + +#define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \ + (1UL << IRQ_PEND_MCHK_EX)) + +#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV)) + +struct kvm_s390_interrupt_info { + struct list_head list; + u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + }; +}; + +struct kvm_s390_irq_payload { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; +}; + +struct kvm_s390_local_interrupt { + spinlock_t lock; + DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); + struct kvm_s390_irq_payload irq; + unsigned long pending_irqs; +}; + +#define FIRQ_LIST_IO_ISC_0 0 +#define FIRQ_LIST_IO_ISC_1 1 +#define FIRQ_LIST_IO_ISC_2 2 +#define FIRQ_LIST_IO_ISC_3 3 +#define FIRQ_LIST_IO_ISC_4 4 +#define FIRQ_LIST_IO_ISC_5 5 +#define FIRQ_LIST_IO_ISC_6 6 +#define FIRQ_LIST_IO_ISC_7 7 +#define FIRQ_LIST_PFAULT 8 +#define FIRQ_LIST_VIRTIO 9 +#define FIRQ_LIST_COUNT 10 +#define FIRQ_CNTR_IO 0 +#define FIRQ_CNTR_SERVICE 1 +#define FIRQ_CNTR_VIRTIO 2 +#define FIRQ_CNTR_PFAULT 3 +#define FIRQ_MAX_COUNT 4 + +/* mask the AIS mode for a given ISC */ +#define AIS_MODE_MASK(isc) (0x80 >> isc) + +#define KVM_S390_AIS_MODE_ALL 0 +#define KVM_S390_AIS_MODE_SINGLE 1 + +struct kvm_s390_float_interrupt { + unsigned long pending_irqs; + unsigned long masked_irqs; + spinlock_t lock; + struct list_head lists[FIRQ_LIST_COUNT]; + int counters[FIRQ_MAX_COUNT]; + struct kvm_s390_mchk_info mchk; + struct kvm_s390_ext_info srv_signal; + int next_rr_cpu; + struct mutex ais_lock; + u8 simm; + u8 nimm; +}; + +struct kvm_hw_wp_info_arch { + unsigned long addr; + unsigned long phys_addr; + int len; + char *old_data; +}; + +struct kvm_hw_bp_info_arch { + unsigned long addr; + int len; +}; + +/* + * Only the upper 16 bits of kvm_guest_debug->control are arch specific. + * Further KVM_GUESTDBG flags which an be used from userspace can be found in + * arch/s390/include/uapi/asm/kvm.h + */ +#define KVM_GUESTDBG_EXIT_PENDING 0x10000000 + +#define guestdbg_enabled(vcpu) \ + (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) +#define guestdbg_sstep_enabled(vcpu) \ + (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) +#define guestdbg_hw_bp_enabled(vcpu) \ + (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) +#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \ + (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING)) + +#define KVM_GUESTDBG_VALID_MASK \ + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\ + KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING) + +struct kvm_guestdbg_info_arch { + unsigned long cr0; + unsigned long cr9; + unsigned long cr10; + unsigned long cr11; + struct kvm_hw_bp_info_arch *hw_bp_info; + struct kvm_hw_wp_info_arch *hw_wp_info; + int nr_hw_bp; + int nr_hw_wp; + unsigned long last_bp; +}; + +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + +struct kvm_vcpu_arch { + struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; + unsigned int host_acrs[NUM_ACRS]; + struct gs_cb *host_gscb; + struct fpu host_fpregs; + struct kvm_s390_local_interrupt local_int; + struct hrtimer ckc_timer; + struct kvm_s390_pgm_info pgm; + struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; + struct kvm_guestdbg_info_arch guestdbg; + unsigned long pfault_token; + unsigned long pfault_select; + unsigned long pfault_compare; + bool cputm_enabled; + /* + * The seqcount protects updates to cputm_start and sie_block.cputm, + * this way we can have non-blocking reads with consistent values. + * Only the owning VCPU thread (vcpu->cpu) is allowed to change these + * values and to start/stop/enable/disable cpu timer accounting. + */ + seqcount_t cputm_seqcount; + __u64 cputm_start; + bool gs_enabled; + bool skey_enabled; + struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; +}; + +struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; + u64 inject_io; + u64 inject_float_mchk; + u64 inject_pfault_done; + u64 inject_service_signal; + u64 inject_virtio; + u64 aen_forward; +}; + +struct kvm_arch_memory_slot { +}; + +struct s390_map_info { + struct list_head list; + __u64 guest_addr; + __u64 addr; + struct page *page; +}; + +struct s390_io_adapter { + unsigned int id; + int isc; + bool maskable; + bool masked; + bool swap; + bool suppressible; +}; + +#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) +#define MAX_S390_ADAPTER_MAPS 256 + +/* maximum size of facilities and facility mask is 2k bytes */ +#define S390_ARCH_FAC_LIST_SIZE_BYTE (1<<11) +#define S390_ARCH_FAC_LIST_SIZE_U64 \ + (S390_ARCH_FAC_LIST_SIZE_BYTE / sizeof(u64)) +#define S390_ARCH_FAC_MASK_SIZE_BYTE S390_ARCH_FAC_LIST_SIZE_BYTE +#define S390_ARCH_FAC_MASK_SIZE_U64 \ + (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64)) + +struct kvm_s390_cpu_model { + /* facility mask supported by kvm & hosting machine */ + __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + struct kvm_s390_vm_cpu_subfunc subfuncs; + /* facility list requested by guest (in dma page) */ + __u64 *fac_list; + u64 cpuid; + unsigned short ibc; + /* subset of available UV-features for pv-guests enabled by user space */ + struct kvm_s390_vm_cpu_uv_feat uv_feat_guest; +}; + +typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); + +struct kvm_s390_crypto { + struct kvm_s390_crypto_cb *crycb; + struct rw_semaphore pqap_hook_rwsem; + crypto_hook *pqap_hook; + __u32 crycbd; + __u8 aes_kw; + __u8 dea_kw; + __u8 apie; +}; + +#define APCB0_MASK_SIZE 1 +struct kvm_s390_apcb0 { + __u64 apm[APCB0_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB0_MASK_SIZE]; /* 0x0008 */ + __u64 adm[APCB0_MASK_SIZE]; /* 0x0010 */ + __u64 reserved18; /* 0x0018 */ +}; + +#define APCB1_MASK_SIZE 4 +struct kvm_s390_apcb1 { + __u64 apm[APCB1_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB1_MASK_SIZE]; /* 0x0020 */ + __u64 adm[APCB1_MASK_SIZE]; /* 0x0040 */ + __u64 reserved60[4]; /* 0x0060 */ +}; + +struct kvm_s390_crypto_cb { + struct kvm_s390_apcb0 apcb0; /* 0x0000 */ + __u8 reserved20[0x0048 - 0x0020]; /* 0x0020 */ + __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ + __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ + struct kvm_s390_apcb1 apcb1; /* 0x0080 */ +}; + +struct kvm_s390_gisa { + union { + struct { /* common to all formats */ + u32 next_alert; + u8 ipm; + u8 reserved01[2]; + u8 iam; + }; + struct { /* format 0 */ + u32 next_alert; + u8 ipm; + u8 reserved01; + u8 : 6; + u8 g : 1; + u8 c : 1; + u8 iam; + u8 reserved02[4]; + u32 airq_count; + } g0; + struct { /* format 1 */ + u32 next_alert; + u8 ipm; + u8 simm; + u8 nimm; + u8 iam; + u8 aism[8]; + u8 : 6; + u8 g : 1; + u8 c : 1; + u8 reserved03[11]; + u32 airq_count; + } g1; + struct { + u64 word[4]; + } u64; + }; +}; + +struct kvm_s390_gib { + u32 alert_list_origin; + u32 reserved01; + u8:5; + u8 nisc:3; + u8 reserved03[3]; + u32 reserved04[5]; +}; + +/* + * sie_page2 has to be allocated as DMA because fac_list, crycb and + * gisa need 31bit addresses in the sie control block. + */ +struct sie_page2 { + __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ + struct kvm_s390_crypto_cb crycb; /* 0x0800 */ + struct kvm_s390_gisa gisa; /* 0x0900 */ + struct kvm *kvm; /* 0x0920 */ + u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ +}; + +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + +struct kvm_s390_gisa_iam { + u8 mask; + spinlock_t ref_lock; + u32 ref_count[MAX_ISC + 1]; +}; + +struct kvm_s390_gisa_interrupt { + struct kvm_s390_gisa *origin; + struct kvm_s390_gisa_iam alert; + struct hrtimer timer; + u64 expires; + DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); +}; + +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + void *stor_var; + bool dumping; + void *set_aside; + struct list_head need_cleanup; + struct mmu_notifier mmu_notifier; +}; + +struct kvm_arch{ + void *sca; + int use_esca; + rwlock_t sca_lock; + debug_info_t *dbf; + struct kvm_s390_float_interrupt float_int; + struct kvm_device *flic; + struct gmap *gmap; + unsigned long mem_limit; + int css_support; + int use_irqchip; + int use_cmma; + int use_pfmfi; + int use_skf; + int use_zpci_interp; + int user_cpu_state_ctrl; + int user_sigp; + int user_stsi; + int user_instr0; + struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; + wait_queue_head_t ipte_wq; + int ipte_lock_count; + struct mutex ipte_mutex; + spinlock_t start_stop_lock; + struct sie_page2 *sie_page2; + struct kvm_s390_cpu_model model; + struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; + u8 epdx; + u64 epoch; + int migration_mode; + atomic64_t cmma_dirty_pages; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; +}; + +#define KVM_HVA_ERR_BAD (-1UL) +#define KVM_HVA_ERR_RO_BAD (-2UL) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return IS_ERR_VALUE(addr); +} + +#define ASYNC_PF_PER_VCPU 64 +struct kvm_arch_async_pf { + unsigned long pfault_token; +}; + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {} + +void kvm_arch_crypto_clear_masks(struct kvm *kvm); +void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, + unsigned long *aqm, unsigned long *adm); + +int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa); + +static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) +{ + return __sie64a(virt_to_phys(sie_block), sie_block, rsa); +} + +extern char sie_exit; + +bool kvm_s390_pv_is_protected(struct kvm *kvm); +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); + +extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); +extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); + +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} +static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +struct zpci_kvm_hook { + int (*kvm_register)(void *opaque, struct kvm *kvm); + void (*kvm_unregister)(void *opaque); +}; + +extern struct zpci_kvm_hook zpci_kvm_hook; + +#endif diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h new file mode 100644 index 0000000000..df73a05276 --- /dev/null +++ b/arch/s390/include/asm/kvm_para.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * definition for paravirtual devices on s390 + * + * Copyright IBM Corp. 2008 + * + * Author(s): Christian Borntraeger + */ +/* + * Hypercalls for KVM on s390. The calling convention is similar to the + * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 + * as hypercall number and R7 as parameter 6. The return value is + * written to R2. We use the diagnose instruction as hypercall. To avoid + * conflicts with existing diagnoses for LPAR and z/VM, we do not use + * the instruction encoded number, but specify the number in R1 and + * use 0x500 as KVM hypercall + * + * Copyright IBM Corp. 2007,2008 + * Author(s): Christian Borntraeger + */ +#ifndef __S390_KVM_PARA_H +#define __S390_KVM_PARA_H + +#include +#include + +#define HYPERCALL_FMT_0 +#define HYPERCALL_FMT_1 , "0" (r2) +#define HYPERCALL_FMT_2 , "d" (r3) HYPERCALL_FMT_1 +#define HYPERCALL_FMT_3 , "d" (r4) HYPERCALL_FMT_2 +#define HYPERCALL_FMT_4 , "d" (r5) HYPERCALL_FMT_3 +#define HYPERCALL_FMT_5 , "d" (r6) HYPERCALL_FMT_4 +#define HYPERCALL_FMT_6 , "d" (r7) HYPERCALL_FMT_5 + +#define HYPERCALL_PARM_0 +#define HYPERCALL_PARM_1 , unsigned long arg1 +#define HYPERCALL_PARM_2 HYPERCALL_PARM_1, unsigned long arg2 +#define HYPERCALL_PARM_3 HYPERCALL_PARM_2, unsigned long arg3 +#define HYPERCALL_PARM_4 HYPERCALL_PARM_3, unsigned long arg4 +#define HYPERCALL_PARM_5 HYPERCALL_PARM_4, unsigned long arg5 +#define HYPERCALL_PARM_6 HYPERCALL_PARM_5, unsigned long arg6 + +#define HYPERCALL_REGS_0 +#define HYPERCALL_REGS_1 \ + register unsigned long r2 asm("2") = arg1 +#define HYPERCALL_REGS_2 \ + HYPERCALL_REGS_1; \ + register unsigned long r3 asm("3") = arg2 +#define HYPERCALL_REGS_3 \ + HYPERCALL_REGS_2; \ + register unsigned long r4 asm("4") = arg3 +#define HYPERCALL_REGS_4 \ + HYPERCALL_REGS_3; \ + register unsigned long r5 asm("5") = arg4 +#define HYPERCALL_REGS_5 \ + HYPERCALL_REGS_4; \ + register unsigned long r6 asm("6") = arg5 +#define HYPERCALL_REGS_6 \ + HYPERCALL_REGS_5; \ + register unsigned long r7 asm("7") = arg6 + +#define HYPERCALL_ARGS_0 +#define HYPERCALL_ARGS_1 , arg1 +#define HYPERCALL_ARGS_2 HYPERCALL_ARGS_1, arg2 +#define HYPERCALL_ARGS_3 HYPERCALL_ARGS_2, arg3 +#define HYPERCALL_ARGS_4 HYPERCALL_ARGS_3, arg4 +#define HYPERCALL_ARGS_5 HYPERCALL_ARGS_4, arg5 +#define HYPERCALL_ARGS_6 HYPERCALL_ARGS_5, arg6 + +#define GENERATE_KVM_HYPERCALL_FUNC(args) \ +static inline \ +long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + register unsigned long __nr asm("1") = nr; \ + register long __rc asm("2"); \ + HYPERCALL_REGS_##args; \ + \ + asm volatile ( \ + " diag 2,4,0x500\n" \ + : "=d" (__rc) \ + : "d" (__nr) HYPERCALL_FMT_##args \ + : "memory", "cc"); \ + return __rc; \ +} \ + \ +static inline \ +long kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \ +{ \ + diag_stat_inc(DIAG_STAT_X500); \ + return __kvm_hypercall##args(nr HYPERCALL_ARGS_##args); \ +} + +GENERATE_KVM_HYPERCALL_FUNC(0) +GENERATE_KVM_HYPERCALL_FUNC(1) +GENERATE_KVM_HYPERCALL_FUNC(2) +GENERATE_KVM_HYPERCALL_FUNC(3) +GENERATE_KVM_HYPERCALL_FUNC(4) +GENERATE_KVM_HYPERCALL_FUNC(5) +GENERATE_KVM_HYPERCALL_FUNC(6) + +/* kvm on s390 is always paravirtualization enabled */ +static inline int kvm_para_available(void) +{ + return 1; +} + +/* No feature bits are currently assigned for kvm on s390 */ +static inline unsigned int kvm_arch_para_features(void) +{ + return 0; +} + +static inline unsigned int kvm_arch_para_hints(void) +{ + return 0; +} + +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + +#endif /* __S390_KVM_PARA_H */ diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h new file mode 100644 index 0000000000..df3fb7d822 --- /dev/null +++ b/arch/s390/include/asm/linkage.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_LINKAGE_H +#define __ASM_LINKAGE_H + +#include + +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07 +#define __ALIGN_STR __stringify(__ALIGN) + +#endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h new file mode 100644 index 0000000000..69ccc464a4 --- /dev/null +++ b/arch/s390/include/asm/lowcore.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2012 + * Author(s): Hartmut Penner , + * Martin Schwidefsky , + * Denis Joseph Barrow, + */ + +#ifndef _ASM_S390_LOWCORE_H +#define _ASM_S390_LOWCORE_H + +#include +#include +#include +#include + +#define LC_ORDER 1 +#define LC_PAGES 2 + +struct pgm_tdb { + u64 data[32]; +}; + +struct lowcore { + __u8 pad_0x0000[0x0014-0x0000]; /* 0x0000 */ + __u32 ipl_parmblock_ptr; /* 0x0014 */ + __u8 pad_0x0018[0x0080-0x0018]; /* 0x0018 */ + __u32 ext_params; /* 0x0080 */ + union { + struct { + __u16 ext_cpu_addr; /* 0x0084 */ + __u16 ext_int_code; /* 0x0086 */ + }; + __u32 ext_int_code_addr; + }; + __u32 svc_int_code; /* 0x0088 */ + union { + struct { + __u16 pgm_ilc; /* 0x008c */ + __u16 pgm_code; /* 0x008e */ + }; + __u32 pgm_int_code; + }; + __u32 data_exc_code; /* 0x0090 */ + __u16 mon_class_num; /* 0x0094 */ + union { + struct { + __u8 per_code; /* 0x0096 */ + __u8 per_atmid; /* 0x0097 */ + }; + __u16 per_code_combined; + }; + __u64 per_address; /* 0x0098 */ + __u8 exc_access_id; /* 0x00a0 */ + __u8 per_access_id; /* 0x00a1 */ + __u8 op_access_id; /* 0x00a2 */ + __u8 ar_mode_id; /* 0x00a3 */ + __u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */ + __u64 trans_exc_code; /* 0x00a8 */ + __u64 monitor_code; /* 0x00b0 */ + union { + struct { + __u16 subchannel_id; /* 0x00b8 */ + __u16 subchannel_nr; /* 0x00ba */ + __u32 io_int_parm; /* 0x00bc */ + __u32 io_int_word; /* 0x00c0 */ + }; + struct tpi_info tpi_info; /* 0x00b8 */ + }; + __u8 pad_0x00c4[0x00c8-0x00c4]; /* 0x00c4 */ + __u32 stfl_fac_list; /* 0x00c8 */ + __u8 pad_0x00cc[0x00e8-0x00cc]; /* 0x00cc */ + __u64 mcck_interruption_code; /* 0x00e8 */ + __u8 pad_0x00f0[0x00f4-0x00f0]; /* 0x00f0 */ + __u32 external_damage_code; /* 0x00f4 */ + __u64 failing_storage_address; /* 0x00f8 */ + __u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */ + __u64 pgm_last_break; /* 0x0110 */ + __u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */ + psw_t restart_old_psw; /* 0x0120 */ + psw_t external_old_psw; /* 0x0130 */ + psw_t svc_old_psw; /* 0x0140 */ + psw_t program_old_psw; /* 0x0150 */ + psw_t mcck_old_psw; /* 0x0160 */ + psw_t io_old_psw; /* 0x0170 */ + __u8 pad_0x0180[0x01a0-0x0180]; /* 0x0180 */ + psw_t restart_psw; /* 0x01a0 */ + psw_t external_new_psw; /* 0x01b0 */ + psw_t svc_new_psw; /* 0x01c0 */ + psw_t program_new_psw; /* 0x01d0 */ + psw_t mcck_new_psw; /* 0x01e0 */ + psw_t io_new_psw; /* 0x01f0 */ + + /* Save areas. */ + __u64 save_area_sync[8]; /* 0x0200 */ + __u64 save_area_async[8]; /* 0x0240 */ + __u64 save_area_restart[1]; /* 0x0280 */ + + /* CPU flags. */ + __u64 cpu_flags; /* 0x0288 */ + + /* Return psws. */ + psw_t return_psw; /* 0x0290 */ + psw_t return_mcck_psw; /* 0x02a0 */ + + __u64 last_break; /* 0x02b0 */ + + /* CPU accounting and timing values. */ + __u64 sys_enter_timer; /* 0x02b8 */ + __u64 mcck_enter_timer; /* 0x02c0 */ + __u64 exit_timer; /* 0x02c8 */ + __u64 user_timer; /* 0x02d0 */ + __u64 guest_timer; /* 0x02d8 */ + __u64 system_timer; /* 0x02e0 */ + __u64 hardirq_timer; /* 0x02e8 */ + __u64 softirq_timer; /* 0x02f0 */ + __u64 steal_timer; /* 0x02f8 */ + __u64 avg_steal_timer; /* 0x0300 */ + __u64 last_update_timer; /* 0x0308 */ + __u64 last_update_clock; /* 0x0310 */ + __u64 int_clock; /* 0x0318 */ + __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ + __u64 clock_comparator; /* 0x0328 */ + __u64 boot_clock[2]; /* 0x0330 */ + + /* Current process. */ + __u64 current_task; /* 0x0340 */ + __u64 kernel_stack; /* 0x0348 */ + + /* Interrupt, DAT-off and restartstack. */ + __u64 async_stack; /* 0x0350 */ + __u64 nodat_stack; /* 0x0358 */ + __u64 restart_stack; /* 0x0360 */ + __u64 mcck_stack; /* 0x0368 */ + /* Restart function and parameter. */ + __u64 restart_fn; /* 0x0370 */ + __u64 restart_data; /* 0x0378 */ + __u32 restart_source; /* 0x0380 */ + __u32 restart_flags; /* 0x0384 */ + + /* Address space pointer. */ + __u64 kernel_asce; /* 0x0388 */ + __u64 user_asce; /* 0x0390 */ + + /* + * The lpp and current_pid fields form a + * 64-bit value that is set as program + * parameter with the LPP instruction. + */ + __u32 lpp; /* 0x0398 */ + __u32 current_pid; /* 0x039c */ + + /* SMP info area */ + __u32 cpu_nr; /* 0x03a0 */ + __u32 softirq_pending; /* 0x03a4 */ + __s32 preempt_count; /* 0x03a8 */ + __u32 spinlock_lockval; /* 0x03ac */ + __u32 spinlock_index; /* 0x03b0 */ + __u32 fpu_flags; /* 0x03b4 */ + __u64 percpu_offset; /* 0x03b8 */ + __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */ + __u64 machine_flags; /* 0x03c8 */ + __u64 gmap; /* 0x03d0 */ + __u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */ + + __u32 return_lpswe; /* 0x0400 */ + __u32 return_mcck_lpswe; /* 0x0404 */ + __u8 pad_0x040a[0x0e00-0x0408]; /* 0x0408 */ + + /* + * 0xe00 contains the address of the IPL Parameter Information + * block. Dump tools need IPIB for IPL after dump. + * Note: do not change the position of any fields in 0x0e00-0x0f00 + */ + __u64 ipib; /* 0x0e00 */ + __u32 ipib_checksum; /* 0x0e08 */ + __u64 vmcore_info; /* 0x0e0c */ + __u8 pad_0x0e14[0x0e18-0x0e14]; /* 0x0e14 */ + __u64 os_info; /* 0x0e18 */ + __u8 pad_0x0e20[0x11b0-0x0e20]; /* 0x0e20 */ + + /* Pointer to the machine check extended save area */ + __u64 mcesad; /* 0x11b0 */ + + /* 64 bit extparam used for pfault/diag 250: defined by architecture */ + __u64 ext_params2; /* 0x11B8 */ + __u8 pad_0x11c0[0x1200-0x11C0]; /* 0x11C0 */ + + /* CPU register save area: defined by architecture */ + __u64 floating_pt_save_area[16]; /* 0x1200 */ + __u64 gpregs_save_area[16]; /* 0x1280 */ + psw_t psw_save_area; /* 0x1300 */ + __u8 pad_0x1310[0x1318-0x1310]; /* 0x1310 */ + __u32 prefixreg_save_area; /* 0x1318 */ + __u32 fpt_creg_save_area; /* 0x131c */ + __u8 pad_0x1320[0x1324-0x1320]; /* 0x1320 */ + __u32 tod_progreg_save_area; /* 0x1324 */ + __u32 cpu_timer_save_area[2]; /* 0x1328 */ + __u32 clock_comp_save_area[2]; /* 0x1330 */ + __u64 last_break_save_area; /* 0x1338 */ + __u32 access_regs_save_area[16]; /* 0x1340 */ + __u64 cregs_save_area[16]; /* 0x1380 */ + __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */ + /* Cryptography-counter designation */ + __u64 ccd; /* 0x1500 */ + /* AI-extension counter designation */ + __u64 aicd; /* 0x1508 */ + __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */ + + /* Transaction abort diagnostic block */ + struct pgm_tdb pgm_tdb; /* 0x1800 */ + __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ +} __packed __aligned(8192); + +#define S390_lowcore (*((struct lowcore *) 0)) + +extern struct lowcore *lowcore_ptr[]; + +static inline void set_prefix(__u32 address) +{ + asm volatile("spx %0" : : "Q" (address) : "memory"); +} + +static inline __u32 store_prefix(void) +{ + __u32 address; + + asm volatile("stpx %0" : "=Q" (address)); + return address; +} + +#endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h new file mode 100644 index 0000000000..50225940d9 --- /dev/null +++ b/arch/s390/include/asm/maccess.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_MACCESS_H +#define __ASM_S390_MACCESS_H + +#include + +#define MEMCPY_REAL_SIZE PAGE_SIZE +#define MEMCPY_REAL_MASK PAGE_MASK + +struct iov_iter; + +extern unsigned long __memcpy_real_area; +extern pte_t *memcpy_real_ptep; +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); +int memcpy_real(void *dest, unsigned long src, size_t count); +#ifdef CONFIG_CRASH_DUMP +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count); +#endif + +#endif /* __ASM_S390_MACCESS_H */ diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h new file mode 100644 index 0000000000..b85e13505a --- /dev/null +++ b/arch/s390/include/asm/mem_encrypt.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef S390_MEM_ENCRYPT_H__ +#define S390_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +int set_memory_encrypted(unsigned long vaddr, int numpages); +int set_memory_decrypted(unsigned long vaddr, int numpages); + +#endif /* __ASSEMBLY__ */ + +#endif /* S390_MEM_ENCRYPT_H__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h new file mode 100644 index 0000000000..829d68e2c6 --- /dev/null +++ b/arch/s390/include/asm/mmu.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MMU_H +#define __MMU_H + +#include +#include +#include + +typedef struct { + spinlock_t lock; + cpumask_t cpu_attach_mask; + atomic_t flush_count; + unsigned int flush_mm; + struct list_head pgtable_list; + struct list_head gmap_list; + unsigned long gmap_asce; + unsigned long asce; + unsigned long asce_limit; + unsigned long vdso_base; + /* The mmu context belongs to a secure guest. */ + atomic_t protected_count; + /* + * The following bitfields need a down_write on the mm + * semaphore when they are written to. As they are only + * written once, they can be read without a lock. + * + * The mmu context allocates 4K page tables. + */ + unsigned int alloc_pgste:1; + /* The mmu context uses extended page tables. */ + unsigned int has_pgste:1; + /* The mmu context uses storage keys. */ + unsigned int uses_skeys:1; + /* The mmu context uses CMM. */ + unsigned int uses_cmm:1; + /* The gmaps associated with this context are allowed to use huge pages. */ + unsigned int allow_gmap_hpage_1m:1; +} mm_context_t; + +#define INIT_MM_CONTEXT(name) \ + .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), + +#endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h new file mode 100644 index 0000000000..2a38af5a00 --- /dev/null +++ b/arch/s390/include/asm/mmu_context.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * + * Derived from "include/asm-i386/mmu_context.h" + */ + +#ifndef __S390_MMU_CONTEXT_H +#define __S390_MMU_CONTEXT_H + +#include +#include +#include +#include +#include +#include + +#define init_new_context init_new_context +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + unsigned long asce_type, init_entry; + + spin_lock_init(&mm->context.lock); + INIT_LIST_HEAD(&mm->context.pgtable_list); + INIT_LIST_HEAD(&mm->context.gmap_list); + cpumask_clear(&mm->context.cpu_attach_mask); + atomic_set(&mm->context.flush_count, 0); + atomic_set(&mm->context.protected_count, 0); + mm->context.gmap_asce = 0; + mm->context.flush_mm = 0; +#ifdef CONFIG_PGSTE + mm->context.alloc_pgste = page_table_allocate_pgste || + test_thread_flag(TIF_PGSTE) || + (current->mm && current->mm->context.alloc_pgste); + mm->context.has_pgste = 0; + mm->context.uses_skeys = 0; + mm->context.uses_cmm = 0; + mm->context.allow_gmap_hpage_1m = 0; +#endif + switch (mm->context.asce_limit) { + default: + /* + * context created by exec, the value of asce_limit can + * only be zero in this case + */ + VM_BUG_ON(mm->context.asce_limit); + /* continue as 3-level task */ + mm->context.asce_limit = _REGION2_SIZE; + fallthrough; + case _REGION2_SIZE: + /* forked 3-level task */ + init_entry = _REGION3_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION3; + break; + case TASK_SIZE_MAX: + /* forked 5-level task */ + init_entry = _REGION1_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION1; + break; + case _REGION1_SIZE: + /* forked 4-level task */ + init_entry = _REGION2_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION2; + break; + } + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | asce_type; + crst_table_init((unsigned long *) mm->pgd, init_entry); + return 0; +} + +static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + int cpu = smp_processor_id(); + + if (next == &init_mm) + S390_lowcore.user_asce = s390_invalid_asce; + else + S390_lowcore.user_asce = next->context.asce; + cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); + /* Clear previous user-ASCE from CR7 */ + __ctl_load(s390_invalid_asce, 7, 7); + if (prev != next) + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); +} +#define switch_mm_irqs_off switch_mm_irqs_off + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +#define finish_arch_post_lock_switch finish_arch_post_lock_switch +static inline void finish_arch_post_lock_switch(void) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + if (mm) { + preempt_disable(); + while (atomic_read(&mm->context.flush_count)) + cpu_relax(); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + __tlb_flush_mm_lazy(mm); + preempt_enable(); + } + __ctl_load(S390_lowcore.user_asce, 7, 7); +} + +#define activate_mm activate_mm +static inline void activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + switch_mm(prev, next, current); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + __ctl_load(S390_lowcore.user_asce, 7, 7); +} + +#include + +#endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h new file mode 100644 index 0000000000..73e3e7c697 --- /dev/null +++ b/arch/s390/include/asm/mmzone.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NUMA support for s390 + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_MMZONE_H +#define _ASM_S390_MMZONE_H + +#ifdef CONFIG_NUMA + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h new file mode 100644 index 0000000000..9f1eea1587 --- /dev/null +++ b/arch/s390/include/asm/module.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MODULE_H +#define _ASM_S390_MODULE_H + +#include + +/* + * This file contains the s390 architecture specific module code. + */ + +struct mod_arch_syminfo { + unsigned long got_offset; + unsigned long plt_offset; + int got_initialized; + int plt_initialized; +}; + +struct mod_arch_specific { + /* Starting offset of got in the module core memory. */ + unsigned long got_offset; + /* Starting offset of plt in the module core memory. */ + unsigned long plt_offset; + /* Size of the got. */ + unsigned long got_size; + /* Size of the plt. */ + unsigned long plt_size; + /* Number of symbols in syminfo. */ + int nsyms; + /* Additional symbol information (got and plt offsets). */ + struct mod_arch_syminfo *syminfo; +#ifdef CONFIG_FUNCTION_TRACER + /* Start of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_start; + /* End of memory reserved for ftrace hotpatch trampolines. */ + struct ftrace_hotpatch_trampoline *trampolines_end; + /* Next unused ftrace hotpatch trampoline slot. */ + struct ftrace_hotpatch_trampoline *next_trampoline; +#endif /* CONFIG_FUNCTION_TRACER */ +}; + +#endif /* _ASM_S390_MODULE_H */ diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h new file mode 100644 index 0000000000..399343ed9f --- /dev/null +++ b/arch/s390/include/asm/msi.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MSI_H +#define _ASM_S390_MSI_H +#include + +/* + * Work around S390 not using irq_domain at all so we can't set + * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works: + * + * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/ + * + * Note this is less isolated than the ARM/x86 versions as userspace can trigger + * MSI belonging to kernel devices within the same gisa. + */ +#define arch_is_isolated_msi() true + +#endif diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h new file mode 100644 index 0000000000..227466ce9e --- /dev/null +++ b/arch/s390/include/asm/nmi.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Machine check handler definitions + * + * Copyright IBM Corp. 2000, 2009 + * Author(s): Ingo Adlung , + * Martin Schwidefsky , + * Cornelia Huck , + */ + +#ifndef _ASM_S390_NMI_H +#define _ASM_S390_NMI_H + +#include +#include + +#define MCIC_SUBCLASS_MASK (1ULL<<63 | 1ULL<<62 | 1ULL<<61 | \ + 1ULL<<59 | 1ULL<<58 | 1ULL<<56 | \ + 1ULL<<55 | 1ULL<<54 | 1ULL<<53 | \ + 1ULL<<52 | 1ULL<<47 | 1ULL<<46 | \ + 1ULL<<45 | 1ULL<<44) +#define MCCK_CODE_SYSTEM_DAMAGE BIT(63) +#define MCCK_CODE_EXT_DAMAGE BIT(63 - 5) +#define MCCK_CODE_CP BIT(63 - 9) +#define MCCK_CODE_STG_ERROR BIT(63 - 16) +#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18) +#define MCCK_CODE_STG_DEGRAD BIT(63 - 19) +#define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20) +#define MCCK_CODE_PSW_IA_VALID BIT(63 - 23) +#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24) +#define MCCK_CODE_CR_VALID BIT(63 - 29) +#define MCCK_CODE_GS_VALID BIT(63 - 36) +#define MCCK_CODE_FC_VALID BIT(63 - 43) +#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) + +#ifndef __ASSEMBLY__ + +union mci { + unsigned long val; + struct { + u64 sd : 1; /* 00 system damage */ + u64 pd : 1; /* 01 instruction-processing damage */ + u64 sr : 1; /* 02 system recovery */ + u64 : 1; /* 03 */ + u64 cd : 1; /* 04 timing-facility damage */ + u64 ed : 1; /* 05 external damage */ + u64 : 1; /* 06 */ + u64 dg : 1; /* 07 degradation */ + u64 w : 1; /* 08 warning pending */ + u64 cp : 1; /* 09 channel-report pending */ + u64 sp : 1; /* 10 service-processor damage */ + u64 ck : 1; /* 11 channel-subsystem damage */ + u64 : 2; /* 12-13 */ + u64 b : 1; /* 14 backed up */ + u64 : 1; /* 15 */ + u64 se : 1; /* 16 storage error uncorrected */ + u64 sc : 1; /* 17 storage error corrected */ + u64 ke : 1; /* 18 storage-key error uncorrected */ + u64 ds : 1; /* 19 storage degradation */ + u64 wp : 1; /* 20 psw mwp validity */ + u64 ms : 1; /* 21 psw mask and key validity */ + u64 pm : 1; /* 22 psw program mask and cc validity */ + u64 ia : 1; /* 23 psw instruction address validity */ + u64 fa : 1; /* 24 failing storage address validity */ + u64 vr : 1; /* 25 vector register validity */ + u64 ec : 1; /* 26 external damage code validity */ + u64 fp : 1; /* 27 floating point register validity */ + u64 gr : 1; /* 28 general register validity */ + u64 cr : 1; /* 29 control register validity */ + u64 : 1; /* 30 */ + u64 st : 1; /* 31 storage logical validity */ + u64 ie : 1; /* 32 indirect storage error */ + u64 ar : 1; /* 33 access register validity */ + u64 da : 1; /* 34 delayed access exception */ + u64 : 1; /* 35 */ + u64 gs : 1; /* 36 guarded storage registers validity */ + u64 : 5; /* 37-41 */ + u64 pr : 1; /* 42 tod programmable register validity */ + u64 fc : 1; /* 43 fp control register validity */ + u64 ap : 1; /* 44 ancillary report */ + u64 : 1; /* 45 */ + u64 ct : 1; /* 46 cpu timer validity */ + u64 cc : 1; /* 47 clock comparator validity */ + u64 : 16; /* 47-63 */ + }; +}; + +#define MCESA_ORIGIN_MASK (~0x3ffUL) +#define MCESA_LC_MASK (0xfUL) +#define MCESA_MIN_SIZE (1024) +#define MCESA_MAX_SIZE (2048) + +struct mcesa { + u8 vector_save_area[1024]; + u8 guarded_storage_save_area[32]; +}; + +struct pt_regs; + +void nmi_alloc_mcesa_early(u64 *mcesad); +int nmi_alloc_mcesa(u64 *mcesad); +void nmi_free_mcesa(u64 *mcesad); + +void s390_handle_mcck(void); +void s390_do_machine_check(struct pt_regs *regs); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_NMI_H */ diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h new file mode 100644 index 0000000000..82725cf783 --- /dev/null +++ b/arch/s390/include/asm/nospec-branch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_EXPOLINE_H +#define _ASM_S390_EXPOLINE_H + +#ifndef __ASSEMBLY__ + +#include + +extern int nospec_disable; + +void nospec_init_branches(void); +void nospec_auto_detect(void); +void nospec_revert(s32 *start, s32 *end); + +static inline bool nospec_uses_trampoline(void) +{ + return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h new file mode 100644 index 0000000000..7a946c42ad --- /dev/null +++ b/arch/s390/include/asm/nospec-insn.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_NOSPEC_ASM_H +#define _ASM_S390_NOSPEC_ASM_H + +#include +#include + +#ifdef __ASSEMBLY__ + +#ifdef CC_USING_EXPOLINE + +/* + * The expoline macros are used to create thunks in the same format + * as gcc generates them. The 'comdat' section flag makes sure that + * the various thunks are merged into a single copy. + */ + .macro __THUNK_PROLOG_NAME name +#ifdef CONFIG_EXPOLINE_EXTERN + .pushsection .text,"ax",@progbits + __ALIGN +#else + .pushsection .text.\name,"axG",@progbits,\name,comdat +#endif + .globl \name + .hidden \name + .type \name,@function +\name: + CFI_STARTPROC + .endm + + .macro __THUNK_EPILOG_NAME name + CFI_ENDPROC +#ifdef CONFIG_EXPOLINE_EXTERN + .size \name, .-\name +#endif + .popsection + .endm + + .macro __THUNK_PROLOG_BR r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r1 + .endm + + .macro __THUNK_EPILOG_BR r1 + __THUNK_EPILOG_NAME __s390_indirect_jump_r\r1 + .endm + + .macro __THUNK_BR r1 + jg __s390_indirect_jump_r\r1 + .endm + + .macro __THUNK_BRASL r1,r2 + brasl \r1,__s390_indirect_jump_r\r2 + .endm + + .macro __DECODE_R expand,reg + .set .L__decode_fail,1 + .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \reg,%r\r1 + \expand \r1 + .set .L__decode_fail,0 + .endif + .endr + .if .L__decode_fail == 1 + .error "__DECODE_R failed" + .endif + .endm + + .macro __DECODE_RR expand,rsave,rtarget + .set .L__decode_fail,1 + .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \rsave,%r\r1 + .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \rtarget,%r\r2 + \expand \r1,\r2 + .set .L__decode_fail,0 + .endif + .endr + .endif + .endr + .if .L__decode_fail == 1 + .error "__DECODE_RR failed" + .endif + .endm + + .macro __THUNK_EX_BR reg + exrl 0,555f + j . +555: br \reg + .endm + +#ifdef CONFIG_EXPOLINE_EXTERN + .macro GEN_BR_THUNK reg + .endm + .macro GEN_BR_THUNK_EXTERN reg +#else + .macro GEN_BR_THUNK reg +#endif + __DECODE_R __THUNK_PROLOG_BR,\reg + __THUNK_EX_BR \reg + __DECODE_R __THUNK_EPILOG_BR,\reg + .endm + + .macro BR_EX reg +557: __DECODE_R __THUNK_BR,\reg + .pushsection .s390_indirect_branches,"a",@progbits + .long 557b-. + .popsection + .endm + + .macro BASR_EX rsave,rtarget +559: __DECODE_RR __THUNK_BRASL,\rsave,\rtarget + .pushsection .s390_indirect_branches,"a",@progbits + .long 559b-. + .popsection + .endm + +#else + .macro GEN_BR_THUNK reg + .endm + + .macro BR_EX reg + br \reg + .endm + + .macro BASR_EX rsave,rtarget + basr \rsave,\rtarget + .endm +#endif /* CC_USING_EXPOLINE */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_NOSPEC_ASM_H */ diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h new file mode 100644 index 0000000000..23cd5d1b73 --- /dev/null +++ b/arch/s390/include/asm/numa.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NUMA support for s390 + * + * Declare the NUMA core code structures and functions. + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_NUMA_H +#define _ASM_S390_NUMA_H + +#ifdef CONFIG_NUMA + +#include + +void numa_setup(void); + +#else + +static inline void numa_setup(void) { } + +#endif /* CONFIG_NUMA */ + +#endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h new file mode 100644 index 0000000000..a4d2e103f1 --- /dev/null +++ b/arch/s390/include/asm/os_info.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OS info memory interface + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu + */ +#ifndef _ASM_S390_OS_INFO_H +#define _ASM_S390_OS_INFO_H + +#include + +#define OS_INFO_VERSION_MAJOR 1 +#define OS_INFO_VERSION_MINOR 1 +#define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */ + +#define OS_INFO_VMCOREINFO 0 +#define OS_INFO_REIPL_BLOCK 1 +#define OS_INFO_FLAGS_ENTRY 2 + +#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) + +struct os_info_entry { + u64 addr; + u64 size; + u32 csum; +} __packed; + +struct os_info { + u64 magic; + u32 csum; + u16 version_major; + u16 version_minor; + u64 crashkernel_addr; + u64 crashkernel_size; + struct os_info_entry entry[3]; + u8 reserved[4004]; +} __packed; + +void os_info_init(void); +void os_info_entry_add(int nr, void *ptr, u64 len); +void os_info_crashkernel_add(unsigned long base, unsigned long size); +u32 os_info_csum(struct os_info *os_info); + +#ifdef CONFIG_CRASH_DUMP +void *os_info_old_entry(int nr, unsigned long *size); +#else +static inline void *os_info_old_entry(int nr, unsigned long *size) +{ + return NULL; +} +#endif + +#endif /* _ASM_S390_OS_INFO_H */ diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h new file mode 100644 index 0000000000..c33c4deb54 --- /dev/null +++ b/arch/s390/include/asm/page-states.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2017 + * Author(s): Claudio Imbrenda + */ + +#ifndef PAGE_STATES_H +#define PAGE_STATES_H + +#define ESSA_GET_STATE 0 +#define ESSA_SET_STABLE 1 +#define ESSA_SET_UNUSED 2 +#define ESSA_SET_VOLATILE 3 +#define ESSA_SET_POT_VOLATILE 4 +#define ESSA_SET_STABLE_RESIDENT 5 +#define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 + +#define ESSA_MAX ESSA_SET_STABLE_NODAT + +#endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h new file mode 100644 index 0000000000..cfec074331 --- /dev/null +++ b/arch/s390/include/asm/page.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Hartmut Penner (hp@de.ibm.com) + */ + +#ifndef _S390_PAGE_H +#define _S390_PAGE_H + +#include +#include + +#define _PAGE_SHIFT 12 +#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +#define _PAGE_MASK (~(_PAGE_SIZE - 1)) + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT _PAGE_SHIFT +#define PAGE_SIZE _PAGE_SIZE +#define PAGE_MASK _PAGE_MASK +#define PAGE_DEFAULT_ACC _AC(0, UL) +/* storage-protection override */ +#define PAGE_SPO_ACC 9 +#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) + +#define HPAGE_SHIFT 20 +#define HPAGE_SIZE (1UL << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define HUGE_MAX_HSTATE 2 + +#define ARCH_HAS_SETCLEAR_HUGE_PTE +#define ARCH_HAS_HUGE_PTE_TYPE +#define ARCH_HAS_PREPARE_HUGEPAGE +#define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH + +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + +#include +#ifndef __ASSEMBLY__ + +void __storage_key_init_range(unsigned long start, unsigned long end); + +static inline void storage_key_init_range(unsigned long start, unsigned long end) +{ + if (PAGE_DEFAULT_KEY != 0) + __storage_key_init_range(start, end); +} + +#define clear_page(page) memset((page), 0, PAGE_SIZE) + +/* + * copy_page uses the mvcl instruction with 0xb0 padding byte in order to + * bypass caches when copying a page. Especially when copying huge pages + * this keeps L1 and L2 data caches alive. + */ +static inline void copy_page(void *to, void *from) +{ + union register_pair dst, src; + + dst.even = (unsigned long) to; + dst.odd = 0x1000; + src.even = (unsigned long) from; + src.odd = 0xb0001000; + + asm volatile( + " mvcl %[dst],%[src]" + : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) + : : "memory", "cc"); +} + +#define clear_user_page(page, vaddr, pg) clear_page(page) +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) + +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + +/* + * These are used to make use of C type-checking.. + */ + +typedef struct { unsigned long pgprot; } pgprot_t; +typedef struct { unsigned long pgste; } pgste_t; +typedef struct { unsigned long pte; } pte_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long p4d; } p4d_t; +typedef struct { unsigned long pgd; } pgd_t; +typedef pte_t *pgtable_t; + +#define pgprot_val(x) ((x).pgprot) +#define pgste_val(x) ((x).pgste) + +static inline unsigned long pte_val(pte_t pte) +{ + return pte.pte; +} + +static inline unsigned long pmd_val(pmd_t pmd) +{ + return pmd.pmd; +} + +static inline unsigned long pud_val(pud_t pud) +{ + return pud.pud; +} + +static inline unsigned long p4d_val(p4d_t p4d) +{ + return p4d.p4d; +} + +static inline unsigned long pgd_val(pgd_t pgd) +{ + return pgd.pgd; +} + +#define __pgste(x) ((pgste_t) { (x) } ) +#define __pte(x) ((pte_t) { (x) } ) +#define __pmd(x) ((pmd_t) { (x) } ) +#define __pud(x) ((pud_t) { (x) } ) +#define __p4d(x) ((p4d_t) { (x) } ) +#define __pgd(x) ((pgd_t) { (x) } ) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +static inline void page_set_storage_key(unsigned long addr, + unsigned char skey, int mapped) +{ + if (!mapped) + asm volatile(".insn rrf,0xb22b0000,%0,%1,8,0" + : : "d" (skey), "a" (addr)); + else + asm volatile("sske %0,%1" : : "d" (skey), "a" (addr)); +} + +static inline unsigned char page_get_storage_key(unsigned long addr) +{ + unsigned char skey; + + asm volatile("iske %0,%1" : "=d" (skey) : "a" (addr)); + return skey; +} + +static inline int page_reset_referenced(unsigned long addr) +{ + int cc; + + asm volatile( + " rrbe 0,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) : "a" (addr) : "cc"); + return cc; +} + +/* Bits int the storage key */ +#define _PAGE_CHANGED 0x02 /* HW changed bit */ +#define _PAGE_REFERENCED 0x04 /* HW referenced bit */ +#define _PAGE_FP_BIT 0x08 /* HW fetch protection bit */ +#define _PAGE_ACC_BITS 0xf0 /* HW access control bits */ + +struct page; +void arch_free_page(struct page *page, int order); +void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); + +static inline int devmem_is_allowed(unsigned long pfn) +{ + return 0; +} + +#define HAVE_ARCH_FREE_PAGE +#define HAVE_ARCH_ALLOC_PAGE + +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif + +#define __PAGE_OFFSET 0x0UL +#define PAGE_OFFSET 0x0UL + +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)(unsigned long)(x)) + +#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) + +static inline void *pfn_to_virt(unsigned long pfn) +{ + return __va(pfn_to_phys(pfn)); +} + +static inline unsigned long virt_to_pfn(const void *kaddr) +{ + return phys_to_pfn(__pa(kaddr)); +} + +#define pfn_to_kaddr(pfn) pfn_to_virt(pfn) + +#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) + +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) + +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC + +#endif /* !__ASSEMBLY__ */ + +#include +#include + +#endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h new file mode 100644 index 0000000000..7d1888e3de --- /dev/null +++ b/arch/s390/include/asm/pai.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Processor Activity Instrumentation support for cryptography counters + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter + */ +#ifndef _ASM_S390_PAI_H +#define _ASM_S390_PAI_H + +#include +#include +#include + +struct qpaci_info_block { + u64 header; + struct { + u64 : 8; + u64 num_cc : 8; /* # of supported crypto counters */ + u64 : 9; + u64 num_nnpa : 7; /* # of supported NNPA counters */ + u64 : 32; + }; +}; + +static inline int qpaci(struct qpaci_info_block *info) +{ + /* Size of info (in double words minus one) */ + size_t size = sizeof(*info) / sizeof(u64) - 1; + int cc; + + asm volatile( + " lgr 0,%[size]\n" + " .insn s,0xb28f0000,%[info]\n" + " lgr %[size],0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size) + : + : "0", "cc", "memory"); + return cc ? (size + 1) * sizeof(u64) : 0; +} + +#define PAI_CRYPTO_BASE 0x1000 /* First event number */ +#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */ +#define PAI_CRYPTO_KERNEL_OFFSET 2048 +#define PAI_NNPA_BASE 0x1800 /* First event number */ +#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */ + +DECLARE_STATIC_KEY_FALSE(pai_key); + +static __always_inline void pai_kernel_enter(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET); +} + +static __always_inline void pai_kernel_exit(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_PERF_EVENTS)) + return; + if (!static_branch_unlikely(&pai_key)) + return; + if (!S390_lowcore.ccd) + return; + if (!user_mode(regs)) + return; + WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET); +} + +enum paievt_mode { + PAI_MODE_NONE, + PAI_MODE_SAMPLING, + PAI_MODE_COUNTING, +}; + +#endif diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h new file mode 100644 index 0000000000..b248694e00 --- /dev/null +++ b/arch/s390/include/asm/pci.h @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_S390_PCI_H +#define __ASM_S390_PCI_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM 0x10000000 + +#define pcibios_assign_all_busses() (0) + +void __iomem *pci_iomap(struct pci_dev *, int, unsigned long); +void pci_iounmap(struct pci_dev *, void __iomem *); +int pci_domain_nr(struct pci_bus *); +int pci_proc_domain(struct pci_bus *); + +#define ZPCI_BUS_NR 0 /* default bus number */ + +#define ZPCI_NR_DMA_SPACES 1 +#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS +#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) + +#ifdef PCI +#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE) +# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE +#endif +#endif /* PCI */ + +/* PCI Function Controls */ +#define ZPCI_FC_FN_ENABLED 0x80 +#define ZPCI_FC_ERROR 0x40 +#define ZPCI_FC_BLOCKED 0x20 +#define ZPCI_FC_DMA_ENABLED 0x10 + +#define ZPCI_FMB_DMA_COUNTER_VALID (1 << 23) + +struct zpci_fmb_fmt0 { + u64 dma_rbytes; + u64 dma_wbytes; +}; + +struct zpci_fmb_fmt1 { + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; +}; + +struct zpci_fmb_fmt2 { + u64 consumed_work_units; + u64 max_work_units; +}; + +struct zpci_fmb_fmt3 { + u64 tx_bytes; +}; + +struct zpci_fmb { + u32 format : 8; + u32 fmt_ind : 24; + u32 samples; + u64 last_update; + /* common counters */ + u64 ld_ops; + u64 st_ops; + u64 stb_ops; + u64 rpcit_ops; + /* format specific counters */ + union { + struct zpci_fmb_fmt0 fmt0; + struct zpci_fmb_fmt1 fmt1; + struct zpci_fmb_fmt2 fmt2; + struct zpci_fmb_fmt3 fmt3; + }; +} __packed __aligned(128); + +enum zpci_state { + ZPCI_FN_STATE_STANDBY = 0, + ZPCI_FN_STATE_CONFIGURED = 1, + ZPCI_FN_STATE_RESERVED = 2, +}; + +struct zpci_bar_struct { + struct resource *res; /* bus resource */ + void __iomem *mio_wb; + void __iomem *mio_wt; + u32 val; /* bar start & 3 flag bits */ + u16 map_idx; /* index into bar mapping array */ + u8 size; /* order 2 exponent */ +}; + +struct s390_domain; +struct kvm_zdev; + +#define ZPCI_FUNCTIONS_PER_BUS 256 +struct zpci_bus { + struct kref kref; + struct pci_bus *bus; + struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; + struct list_head resources; + struct list_head bus_next; + struct resource bus_resource; + int pchid; + int domain_nr; + bool multifunction; + enum pci_bus_speed max_bus_speed; +}; + +/* Private data per function */ +struct zpci_dev { + struct zpci_bus *zbus; + struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct list_head iommu_list; + struct kref kref; + struct rcu_head rcu; + struct hotplug_slot hotplug_slot; + + enum zpci_state state; + u32 fid; /* function ID, used by sclp */ + u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ + u16 vfn; /* virtual function number */ + u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ + u8 pfgid; /* function group ID */ + u8 pft; /* pci function type */ + u8 port; + u8 dtsm; /* Supported DT mask */ + u8 rid_available : 1; + u8 has_hp_slot : 1; + u8 has_resources : 1; + u8 is_physfn : 1; + u8 util_str_avail : 1; + u8 irqs_registered : 1; + u8 reserved : 2; + unsigned int devfn; /* DEVFN part of the RID*/ + + struct mutex lock; + u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ + u32 uid; /* user defined id */ + u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ + + /* IRQ stuff */ + u64 msi_addr; /* MSI address */ + unsigned int max_msi; /* maximum number of MSI's */ + unsigned int msi_first_bit; + unsigned int msi_nr_irqs; + struct airq_iv *aibv; /* adapter interrupt bit vector */ + unsigned long aisb; /* number of the summary bit */ + + /* DMA stuff */ + unsigned long *dma_table; + int tlb_refresh; + + spinlock_t iommu_bitmap_lock; + unsigned long *iommu_bitmap; + unsigned long *lazy_bitmap; + unsigned long iommu_size; + unsigned long iommu_pages; + unsigned int next_bit; + + struct iommu_device iommu_dev; /* IOMMU core handle */ + + char res_name[16]; + bool mio_capable; + struct zpci_bar_struct bars[PCI_STD_NUM_BARS]; + + u64 start_dma; /* Start of available DMA addresses */ + u64 end_dma; /* End of available DMA addresses */ + u64 dma_mask; /* DMA address space mask */ + + /* Function measurement block */ + struct zpci_fmb *fmb; + u16 fmb_update; /* update interval */ + u16 fmb_length; + /* software counters */ + atomic64_t allocated_pages; + atomic64_t mapped_pages; + atomic64_t unmapped_pages; + + u8 version; + enum pci_bus_speed max_bus_speed; + + struct dentry *debugfs_dev; + + /* IOMMU and passthrough */ + struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; +}; + +static inline bool zdev_enabled(struct zpci_dev *zdev) +{ + return (zdev->fh & (1UL << 31)) ? true : false; +} + +extern const struct attribute_group *zpci_attr_groups[]; +extern unsigned int s390_pci_force_floating __initdata; +extern unsigned int s390_pci_no_rid; + +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; + +/* ----------------------------------------------------------------------------- + Prototypes +----------------------------------------------------------------------------- */ +/* Base stuff */ +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); +int zpci_enable_device(struct zpci_dev *); +int zpci_disable_device(struct zpci_dev *); +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); +int zpci_deconfigure_device(struct zpci_dev *zdev); +void zpci_device_reserved(struct zpci_dev *zdev); +bool zpci_is_device_configured(struct zpci_dev *zdev); + +int zpci_hot_reset_device(struct zpci_dev *zdev); +int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); +int zpci_unregister_ioat(struct zpci_dev *, u8); +void zpci_remove_reserved_devices(void); +void zpci_update_fh(struct zpci_dev *zdev, u32 fh); + +/* CLP */ +int clp_setup_writeback_mio(void); +int clp_scan_pci_devices(void); +int clp_query_pci_fn(struct zpci_dev *zdev); +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as); +int clp_disable_fh(struct zpci_dev *zdev, u32 *fh); +int clp_get_state(u32 fid, enum zpci_state *state); +int clp_refresh_fh(u32 fid, u32 *fh); + +/* UID */ +void update_uid_checking(bool new); + +/* IOMMU Interface */ +int zpci_init_iommu(struct zpci_dev *zdev); +void zpci_destroy_iommu(struct zpci_dev *zdev); + +#ifdef CONFIG_PCI +static inline bool zpci_use_mio(struct zpci_dev *zdev) +{ + return static_branch_likely(&have_mio) && zdev->mio_capable; +} + +/* Error handling and recovery */ +void zpci_event_error(void *); +void zpci_event_availability(void *); +bool zpci_is_enabled(void); +#else /* CONFIG_PCI */ +static inline void zpci_event_error(void *e) {} +static inline void zpci_event_availability(void *e) {} +#endif /* CONFIG_PCI */ + +#ifdef CONFIG_HOTPLUG_PCI_S390 +int zpci_init_slot(struct zpci_dev *); +void zpci_exit_slot(struct zpci_dev *); +#else /* CONFIG_HOTPLUG_PCI_S390 */ +static inline int zpci_init_slot(struct zpci_dev *zdev) +{ + return 0; +} +static inline void zpci_exit_slot(struct zpci_dev *zdev) {} +#endif /* CONFIG_HOTPLUG_PCI_S390 */ + +/* Helpers */ +static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) +{ + struct zpci_bus *zbus = pdev->sysdata; + + return zbus->function[pdev->devfn]; +} + +static inline struct zpci_dev *to_zpci_dev(struct device *dev) +{ + return to_zpci(to_pci_dev(dev)); +} + +struct zpci_dev *get_zdev_by_fid(u32); + +/* DMA */ +int zpci_dma_init(void); +void zpci_dma_exit(void); +int zpci_dma_init_device(struct zpci_dev *zdev); +int zpci_dma_exit_device(struct zpci_dev *zdev); + +/* IRQ */ +int __init zpci_irq_init(void); +void __init zpci_irq_exit(void); + +/* FMB */ +int zpci_fmb_enable_device(struct zpci_dev *); +int zpci_fmb_disable_device(struct zpci_dev *); + +/* Debug */ +int zpci_debug_init(void); +void zpci_debug_exit(void); +void zpci_debug_init_device(struct zpci_dev *, const char *); +void zpci_debug_exit_device(struct zpci_dev *); + +/* Error handling */ +int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); +int zpci_clear_error_state(struct zpci_dev *zdev); +int zpci_reset_load_store_blocked(struct zpci_dev *zdev); + +#ifdef CONFIG_NUMA + +/* Returns the node based on PCI bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + return NUMA_NO_NODE; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpu_online_mask; +} + +#endif /* CONFIG_NUMA */ + +#endif diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h new file mode 100644 index 0000000000..d6189ed14f --- /dev/null +++ b/arch/s390/include/asm/pci_clp.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_PCI_CLP_H +#define _ASM_S390_PCI_CLP_H + +#include + +/* + * Call Logical Processor - Command Codes + */ +#define CLP_SLPC 0x0001 +#define CLP_LIST_PCI 0x0002 +#define CLP_QUERY_PCI_FN 0x0003 +#define CLP_QUERY_PCI_FNGRP 0x0004 +#define CLP_SET_PCI_FN 0x0005 + +/* PCI function handle list entry */ +struct clp_fh_list_entry { + u16 device_id; + u16 vendor_id; + u32 config_state : 1; + u32 : 31; + u32 fid; /* PCI function id */ + u32 fh; /* PCI function handle */ +} __packed; + +#define CLP_RC_SETPCIFN_FH 0x0101 /* Invalid PCI fn handle */ +#define CLP_RC_SETPCIFN_FHOP 0x0102 /* Fn handle not valid for op */ +#define CLP_RC_SETPCIFN_DMAAS 0x0103 /* Invalid DMA addr space */ +#define CLP_RC_SETPCIFN_RES 0x0104 /* Insufficient resources */ +#define CLP_RC_SETPCIFN_ALRDY 0x0105 /* Fn already in requested state */ +#define CLP_RC_SETPCIFN_ERR 0x0106 /* Fn in permanent error state */ +#define CLP_RC_SETPCIFN_RECPND 0x0107 /* Error recovery pending */ +#define CLP_RC_SETPCIFN_BUSY 0x0108 /* Fn busy */ +#define CLP_RC_LISTPCI_BADRT 0x010a /* Resume token not recognized */ +#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */ + +/* request or response block header length */ +#define LIST_PCI_HDR_LEN 32 + +/* Number of function handles fitting in response block */ +#define CLP_FH_LIST_NR_ENTRIES \ + ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \ + / sizeof(struct clp_fh_list_entry)) + +#define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ +#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ +#define CLP_SET_ENABLE_MIO 2 +#define CLP_SET_DISABLE_MIO 3 + +#define CLP_UTIL_STR_LEN 64 +#define CLP_PFIP_NR_SEGMENTS 4 + +extern bool zpci_unique_uid; + +struct clp_rsp_slpc_pci { + struct clp_rsp_hdr hdr; + u32 reserved2[4]; + u32 lpif[8]; + u32 reserved3[4]; + u32 vwb : 1; + u32 : 1; + u32 mio_wb : 6; + u32 : 24; + u32 reserved5[3]; + u32 lpic[8]; +} __packed; + +/* List PCI functions request */ +struct clp_req_list_pci { + struct clp_req_hdr hdr; + u64 resume_token; + u64 reserved2; +} __packed; + +/* List PCI functions response */ +struct clp_rsp_list_pci { + struct clp_rsp_hdr hdr; + u64 resume_token; + u32 reserved2; + u16 max_fn; + u8 : 7; + u8 uid_checking : 1; + u8 entry_size; + struct clp_fh_list_entry fh_list[CLP_FH_LIST_NR_ENTRIES]; +} __packed; + +struct mio_info { + u32 valid : 6; + u32 : 26; + u32 : 32; + struct { + u64 wb; + u64 wt; + } addr[PCI_STD_NUM_BARS]; + u32 reserved[6]; +} __packed; + +/* Query PCI function request */ +struct clp_req_query_pci { + struct clp_req_hdr hdr; + u32 fh; /* function handle */ + u32 reserved2; + u64 reserved3; +} __packed; + +/* Query PCI function response */ +struct clp_rsp_query_pci { + struct clp_rsp_hdr hdr; + u16 vfn; /* virtual fn number */ + u16 : 3; + u16 rid_avail : 1; + u16 is_physfn : 1; + u16 reserved1 : 1; + u16 mio_addr_avail : 1; + u16 util_str_avail : 1; /* utility string available? */ + u16 pfgid : 8; /* pci function group id */ + u32 fid; /* pci function id */ + u8 bar_size[PCI_STD_NUM_BARS]; + u16 pchid; + __le32 bar[PCI_STD_NUM_BARS]; + u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ + u16 : 12; + u16 port : 4; + u8 fmb_len; + u8 pft; /* pci function type */ + u64 sdma; /* start dma as */ + u64 edma; /* end dma as */ +#define ZPCI_RID_MASK_DEVFN 0x00ff + u16 rid; /* BUS/DEVFN PCI address */ + u16 reserved0; + u32 reserved[10]; + u32 uid; /* user defined id */ + u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ + u32 reserved2[16]; + struct mio_info mio; +} __packed; + +/* Query PCI function group request */ +struct clp_req_query_pci_grp { + struct clp_req_hdr hdr; + u32 reserved2 : 24; + u32 pfgid : 8; /* function group id */ + u32 reserved3; + u64 reserved4; +} __packed; + +/* Query PCI function group response */ +struct clp_rsp_query_pci_grp { + struct clp_rsp_hdr hdr; + u16 : 4; + u16 noi : 12; /* number of interrupts */ + u8 version; + u8 : 6; + u8 frame : 1; + u8 refresh : 1; /* TLB refresh mode */ + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ + u16 mui; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; + u16 maxfaal; + u16 : 4; + u16 dnoi : 12; + u16 maxcpu; + u64 dasm; /* dma address space mask */ + u64 msia; /* MSI address */ + u64 reserved4; + u64 reserved5; +} __packed; + +/* Set PCI function request */ +struct clp_req_set_pci { + struct clp_req_hdr hdr; + u32 fh; /* function handle */ + u16 reserved2; + u8 oc; /* operation controls */ + u8 ndas; /* number of dma spaces */ + u32 reserved3; + u32 gisa; /* GISA designation */ +} __packed; + +/* Set PCI function response */ +struct clp_rsp_set_pci { + struct clp_rsp_hdr hdr; + u32 fh; /* function handle */ + u32 reserved1; + u64 reserved2; + struct mio_info mio; +} __packed; + +/* Combined request/response block structures used by clp insn */ +struct clp_req_rsp_slpc_pci { + struct clp_req_slpc request; + struct clp_rsp_slpc_pci response; +} __packed; + +struct clp_req_rsp_list_pci { + struct clp_req_list_pci request; + struct clp_rsp_list_pci response; +} __packed; + +struct clp_req_rsp_set_pci { + struct clp_req_set_pci request; + struct clp_rsp_set_pci response; +} __packed; + +struct clp_req_rsp_query_pci { + struct clp_req_query_pci request; + struct clp_rsp_query_pci response; +} __packed; + +struct clp_req_rsp_query_pci_grp { + struct clp_req_query_pci_grp request; + struct clp_rsp_query_pci_grp response; +} __packed; + +#endif diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h new file mode 100644 index 0000000000..3bb4e7e33a --- /dev/null +++ b/arch/s390/include/asm/pci_debug.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_ASM_PCI_DEBUG_H +#define _S390_ASM_PCI_DEBUG_H + +#include + +extern debug_info_t *pci_debug_msg_id; +extern debug_info_t *pci_debug_err_id; + +#define zpci_dbg(imp, fmt, args...) \ + debug_sprintf_event(pci_debug_msg_id, imp, fmt, ##args) + +#define zpci_err(text...) \ + do { \ + char debug_buffer[16]; \ + snprintf(debug_buffer, 16, text); \ + debug_text_event(pci_debug_err_id, 0, debug_buffer); \ + } while (0) + +static inline void zpci_err_hex_level(int level, void *addr, int len) +{ + debug_event(pci_debug_err_id, level, addr, len); +} + +static inline void zpci_err_hex(void *addr, int len) +{ + zpci_err_hex_level(0, addr, len); +} + +#endif diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h new file mode 100644 index 0000000000..7119c04c51 --- /dev/null +++ b/arch/s390/include/asm/pci_dma.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_PCI_DMA_H +#define _ASM_S390_PCI_DMA_H + +/* I/O Translation Anchor (IOTA) */ +enum zpci_ioat_dtype { + ZPCI_IOTA_STO = 0, + ZPCI_IOTA_RTTO = 1, + ZPCI_IOTA_RSTO = 2, + ZPCI_IOTA_RFTO = 3, + ZPCI_IOTA_PFAA = 4, + ZPCI_IOTA_IOPFAA = 5, + ZPCI_IOTA_IOPTO = 7 +}; + +#define ZPCI_IOTA_IOT_ENABLED 0x800UL +#define ZPCI_IOTA_DT_ST (ZPCI_IOTA_STO << 2) +#define ZPCI_IOTA_DT_RT (ZPCI_IOTA_RTTO << 2) +#define ZPCI_IOTA_DT_RS (ZPCI_IOTA_RSTO << 2) +#define ZPCI_IOTA_DT_RF (ZPCI_IOTA_RFTO << 2) +#define ZPCI_IOTA_DT_PF (ZPCI_IOTA_PFAA << 2) +#define ZPCI_IOTA_FS_4K 0 +#define ZPCI_IOTA_FS_1M 1 +#define ZPCI_IOTA_FS_2G 2 +#define ZPCI_KEY (PAGE_DEFAULT_KEY << 5) + +#define ZPCI_TABLE_SIZE_RT (1UL << 42) + +#define ZPCI_IOTA_STO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_ST) +#define ZPCI_IOTA_RTTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RT) +#define ZPCI_IOTA_RSTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RS) +#define ZPCI_IOTA_RFTO_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_RF) +#define ZPCI_IOTA_RFAA_FLAG (ZPCI_IOTA_IOT_ENABLED | ZPCI_KEY | ZPCI_IOTA_DT_PF | ZPCI_IOTA_FS_2G) + +/* I/O Region and segment tables */ +#define ZPCI_INDEX_MASK 0x7ffUL + +#define ZPCI_TABLE_TYPE_MASK 0xc +#define ZPCI_TABLE_TYPE_RFX 0xc +#define ZPCI_TABLE_TYPE_RSX 0x8 +#define ZPCI_TABLE_TYPE_RTX 0x4 +#define ZPCI_TABLE_TYPE_SX 0x0 + +#define ZPCI_TABLE_LEN_RFX 0x3 +#define ZPCI_TABLE_LEN_RSX 0x3 +#define ZPCI_TABLE_LEN_RTX 0x3 + +#define ZPCI_TABLE_OFFSET_MASK 0xc0 +#define ZPCI_TABLE_SIZE 0x4000 +#define ZPCI_TABLE_ALIGN ZPCI_TABLE_SIZE +#define ZPCI_TABLE_ENTRY_SIZE (sizeof(unsigned long)) +#define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE) + +#define ZPCI_TABLE_BITS 11 +#define ZPCI_PT_BITS 8 +#define ZPCI_ST_SHIFT (ZPCI_PT_BITS + PAGE_SHIFT) +#define ZPCI_RT_SHIFT (ZPCI_ST_SHIFT + ZPCI_TABLE_BITS) + +#define ZPCI_RTE_FLAG_MASK 0x3fffUL +#define ZPCI_RTE_ADDR_MASK (~ZPCI_RTE_FLAG_MASK) +#define ZPCI_STE_FLAG_MASK 0x7ffUL +#define ZPCI_STE_ADDR_MASK (~ZPCI_STE_FLAG_MASK) + +/* I/O Page tables */ +#define ZPCI_PTE_VALID_MASK 0x400 +#define ZPCI_PTE_INVALID 0x400 +#define ZPCI_PTE_VALID 0x000 +#define ZPCI_PT_SIZE 0x800 +#define ZPCI_PT_ALIGN ZPCI_PT_SIZE +#define ZPCI_PT_ENTRIES (ZPCI_PT_SIZE / ZPCI_TABLE_ENTRY_SIZE) +#define ZPCI_PT_MASK (ZPCI_PT_ENTRIES - 1) + +#define ZPCI_PTE_FLAG_MASK 0xfffUL +#define ZPCI_PTE_ADDR_MASK (~ZPCI_PTE_FLAG_MASK) + +/* Shared bits */ +#define ZPCI_TABLE_VALID 0x00 +#define ZPCI_TABLE_INVALID 0x20 +#define ZPCI_TABLE_PROTECTED 0x200 +#define ZPCI_TABLE_UNPROTECTED 0x000 + +#define ZPCI_TABLE_VALID_MASK 0x20 +#define ZPCI_TABLE_PROT_MASK 0x200 + +static inline unsigned int calc_rtx(dma_addr_t ptr) +{ + return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_sx(dma_addr_t ptr) +{ + return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_px(dma_addr_t ptr) +{ + return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK; +} + +static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) +{ + *entry &= ZPCI_PTE_FLAG_MASK; + *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); +} + +static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) +{ + *entry &= ZPCI_RTE_FLAG_MASK; + *entry |= (sto & ZPCI_RTE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_RTX; +} + +static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) +{ + *entry &= ZPCI_STE_FLAG_MASK; + *entry |= (pto & ZPCI_STE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_SX; +} + +static inline void validate_rt_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry &= ~ZPCI_TABLE_OFFSET_MASK; + *entry |= ZPCI_TABLE_VALID; + *entry |= ZPCI_TABLE_LEN_RTX; +} + +static inline void validate_st_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry |= ZPCI_TABLE_VALID; +} + +static inline void invalidate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_INVALID; +} + +static inline void validate_pt_entry(unsigned long *entry) +{ + WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID); + *entry &= ~ZPCI_PTE_VALID_MASK; + *entry |= ZPCI_PTE_VALID; +} + +static inline void entry_set_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_PROTECTED; +} + +static inline void entry_clr_protected(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_PROT_MASK; + *entry |= ZPCI_TABLE_UNPROTECTED; +} + +static inline int reg_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; +} + +static inline int pt_entry_isvalid(unsigned long entry) +{ + return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; +} + +static inline unsigned long *get_rt_sto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) + return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); + else + return NULL; + +} + +static inline unsigned long *get_st_pto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX) + return phys_to_virt(entry & ZPCI_STE_ADDR_MASK); + else + return NULL; +} + +/* Prototypes */ +void dma_free_seg_table(unsigned long); +unsigned long *dma_alloc_cpu_table(gfp_t gfp); +void dma_cleanup_tables(unsigned long *); +unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, + gfp_t gfp); +void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags); + +extern const struct dma_map_ops s390_pci_dma_ops; + + +#endif diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h new file mode 100644 index 0000000000..e5f57cfe1d --- /dev/null +++ b/arch/s390/include/asm/pci_insn.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_PCI_INSN_H +#define _ASM_S390_PCI_INSN_H + +#include + +/* Load/Store status codes */ +#define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 +#define ZPCI_PCI_ST_FUNC_IN_ERR 8 +#define ZPCI_PCI_ST_BLOCKED 12 +#define ZPCI_PCI_ST_INSUF_RES 16 +#define ZPCI_PCI_ST_INVAL_AS 20 +#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED 24 +#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED 28 +#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS 36 +#define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 +#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 + +/* Load/Store return codes */ +#define ZPCI_PCI_LS_OK 0 +#define ZPCI_PCI_LS_ERR 1 +#define ZPCI_PCI_LS_BUSY 2 +#define ZPCI_PCI_LS_INVAL_HANDLE 3 + +/* Load/Store address space identifiers */ +#define ZPCI_PCIAS_MEMIO_0 0 +#define ZPCI_PCIAS_MEMIO_1 1 +#define ZPCI_PCIAS_MEMIO_2 2 +#define ZPCI_PCIAS_MEMIO_3 3 +#define ZPCI_PCIAS_MEMIO_4 4 +#define ZPCI_PCIAS_MEMIO_5 5 +#define ZPCI_PCIAS_CFGSPC 15 + +/* Modify PCI Function Controls */ +#define ZPCI_MOD_FC_REG_INT 2 +#define ZPCI_MOD_FC_DEREG_INT 3 +#define ZPCI_MOD_FC_REG_IOAT 4 +#define ZPCI_MOD_FC_DEREG_IOAT 5 +#define ZPCI_MOD_FC_REREG_IOAT 6 +#define ZPCI_MOD_FC_RESET_ERROR 7 +#define ZPCI_MOD_FC_RESET_BLOCK 9 +#define ZPCI_MOD_FC_SET_MEASURE 10 +#define ZPCI_MOD_FC_REG_INT_D 16 +#define ZPCI_MOD_FC_DEREG_INT_D 17 + +/* FIB function controls */ +#define ZPCI_FIB_FC_ENABLED 0x80 +#define ZPCI_FIB_FC_ERROR 0x40 +#define ZPCI_FIB_FC_LS_BLOCKED 0x20 +#define ZPCI_FIB_FC_DMAAS_REG 0x10 + +/* FIB function controls */ +#define ZPCI_FIB_FC_ENABLED 0x80 +#define ZPCI_FIB_FC_ERROR 0x40 +#define ZPCI_FIB_FC_LS_BLOCKED 0x20 +#define ZPCI_FIB_FC_DMAAS_REG 0x10 + +struct zpci_fib_fmt0 { + u32 : 1; + u32 isc : 3; /* Interrupt subclass */ + u32 noi : 12; /* Number of interrupts */ + u32 : 2; + u32 aibvo : 6; /* Adapter interrupt bit vector offset */ + u32 sum : 1; /* Adapter int summary bit enabled */ + u32 : 1; + u32 aisbo : 6; /* Adapter int summary bit offset */ + u32 : 32; + u64 aibv; /* Adapter int bit vector address */ + u64 aisb; /* Adapter int summary bit address */ +}; + +struct zpci_fib_fmt1 { + u32 : 4; + u32 noi : 12; + u32 : 16; + u32 dibvo : 16; + u32 : 16; + u64 : 64; + u64 : 64; +}; + +/* Function Information Block */ +struct zpci_fib { + u32 fmt : 8; /* format */ + u32 : 24; + u32 : 32; + u8 fc; /* function controls */ + u64 : 56; + u64 pba; /* PCI base address */ + u64 pal; /* PCI address limit */ + u64 iota; /* I/O Translation Anchor */ + union { + struct zpci_fib_fmt0 fmt0; + struct zpci_fib_fmt1 fmt1; + }; + u64 fmb_addr; /* Function measurement block address and key */ + u32 : 32; + u32 gd; +} __packed __aligned(8); + +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + +/* directed interruption information block */ +struct zpci_diib { + u32 : 1; + u32 isc : 3; + u32 : 28; + u16 : 16; + u16 nr_cpus; + u64 disb_addr; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +/* cpu directed interruption information block */ +struct zpci_cdiib { + u64 : 64; + u64 dibv_addr; + u64 : 64; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + +union zpci_sic_iib { + struct zpci_diib diib; + struct zpci_cdiib cdiib; + struct zpci_aipb aipb; +}; + +DECLARE_STATIC_KEY_FALSE(have_mio); + +u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); +int zpci_refresh_trans(u64 fn, u64 addr, u64 range); +int __zpci_load(u64 *data, u64 req, u64 offset); +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len); +int __zpci_store(u64 data, u64 req, u64 offset); +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); +int __zpci_store_block(const u64 *data, u64 req, u64 offset); +void zpci_barrier(void); +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); + +#endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h new file mode 100644 index 0000000000..2686bee800 --- /dev/null +++ b/arch/s390/include/asm/pci_io.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_PCI_IO_H +#define _ASM_S390_PCI_IO_H + +#ifdef CONFIG_PCI + +#include +#include +#include + +/* I/O size constraints */ +#define ZPCI_MAX_READ_SIZE 8 +#define ZPCI_MAX_WRITE_SIZE 128 +#define ZPCI_BOUNDARY_SIZE (1 << 12) +#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1) + +/* I/O Map */ +#define ZPCI_IOMAP_SHIFT 48 +#define ZPCI_IOMAP_ADDR_SHIFT 62 +#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT) +#define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1) +#define ZPCI_IOMAP_MAX_ENTRIES \ + (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT)) +#define ZPCI_IOMAP_ADDR_IDX_MASK \ + ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK) + +struct zpci_iomap_entry { + u32 fh; + u8 bar; + u16 count; +}; + +extern struct zpci_iomap_entry *zpci_iomap_start; + +#define ZPCI_ADDR(idx) (ZPCI_IOMAP_ADDR_BASE | ((u64) idx << ZPCI_IOMAP_SHIFT)) +#define ZPCI_IDX(addr) \ + (((__force u64) addr & ZPCI_IOMAP_ADDR_IDX_MASK) >> ZPCI_IOMAP_SHIFT) +#define ZPCI_OFFSET(addr) \ + ((__force u64) addr & ZPCI_IOMAP_ADDR_OFF_MASK) + +#define ZPCI_CREATE_REQ(handle, space, len) \ + ((u64) handle << 32 | space << 16 | len) + +#define zpci_read(LENGTH, RETTYPE) \ +static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ +{ \ + u64 data; \ + int rc; \ + \ + rc = zpci_load(&data, addr, LENGTH); \ + if (rc) \ + data = -1ULL; \ + return (RETTYPE) data; \ +} + +#define zpci_write(LENGTH, VALTYPE) \ +static inline void zpci_write_##VALTYPE(VALTYPE val, \ + const volatile void __iomem *addr) \ +{ \ + u64 data = (VALTYPE) val; \ + \ + zpci_store(addr, data, LENGTH); \ +} + +zpci_read(8, u64) +zpci_read(4, u32) +zpci_read(2, u16) +zpci_read(1, u8) +zpci_write(8, u64) +zpci_write(4, u32) +zpci_write(2, u16) +zpci_write(1, u8) + +static inline int zpci_write_single(volatile void __iomem *dst, const void *src, + unsigned long len) +{ + u64 val; + + switch (len) { + case 1: + val = (u64) *((u8 *) src); + break; + case 2: + val = (u64) *((u16 *) src); + break; + case 4: + val = (u64) *((u32 *) src); + break; + case 8: + val = (u64) *((u64 *) src); + break; + default: + val = 0; /* let FW report error */ + break; + } + return zpci_store(dst, val, len); +} + +static inline int zpci_read_single(void *dst, const volatile void __iomem *src, + unsigned long len) +{ + u64 data; + int cc; + + cc = zpci_load(&data, src, len); + if (cc) + goto out; + + switch (len) { + case 1: + *((u8 *) dst) = (u8) data; + break; + case 2: + *((u16 *) dst) = (u16) data; + break; + case 4: + *((u32 *) dst) = (u32) data; + break; + case 8: + *((u64 *) dst) = (u64) data; + break; + } +out: + return cc; +} + +int zpci_write_block(volatile void __iomem *dst, const void *src, + unsigned long len); + +static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max) +{ + int offset = dst & ZPCI_BOUNDARY_MASK; + int size; + + size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max); + if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8)) + return size; + + if (size >= 8) + return 8; + return rounddown_pow_of_two(size); +} + +static inline int zpci_memcpy_fromio(void *dst, + const volatile void __iomem *src, + unsigned long n) +{ + int size, rc = 0; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) src, + (u64) dst, n, + ZPCI_MAX_READ_SIZE); + rc = zpci_read_single(dst, src, size); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + return rc; +} + +static inline int zpci_memcpy_toio(volatile void __iomem *dst, + const void *src, unsigned long n) +{ + int size, rc = 0; + + if (!src) + return -EINVAL; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) dst, + (u64) src, n, + ZPCI_MAX_WRITE_SIZE); + if (size > 8) /* main path */ + rc = zpci_write_block(dst, src, size); + else + rc = zpci_write_single(dst, src, size); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + return rc; +} + +static inline int zpci_memset_io(volatile void __iomem *dst, + unsigned char val, size_t count) +{ + u8 *src = kmalloc(count, GFP_KERNEL); + int rc; + + if (src == NULL) + return -ENOMEM; + memset(src, val, count); + + rc = zpci_memcpy_toio(dst, src, count); + kfree(src); + return rc; +} + +#endif /* CONFIG_PCI */ + +#endif /* _ASM_S390_PCI_IO_H */ diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h new file mode 100644 index 0000000000..264095dd84 --- /dev/null +++ b/arch/s390/include/asm/percpu.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_PERCPU__ +#define __ARCH_S390_PERCPU__ + +#include +#include + +/* + * s390 uses its own implementation for per cpu data, the offset of + * the cpu local data area is cached in the cpu's lowcore memory. + */ +#define __my_cpu_offset S390_lowcore.percpu_offset + +/* + * For 64 bit module code, the module may be more than 4G above the + * per cpu area, use weak definitions to force the compiler to + * generate external references. + */ +#if defined(MODULE) +#define ARCH_NEEDS_WEAK_PER_CPU +#endif + +/* + * We use a compare-and-swap loop since that uses less cpu cycles than + * disabling and enabling interrupts like the generic variant would do. + */ +#define arch_this_cpu_to_op_simple(pcp, val, op) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ old__, new__, prev__; \ + pcp_op_T__ *ptr__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + prev__ = READ_ONCE(*ptr__); \ + do { \ + old__ = prev__; \ + new__ = old__ op (val); \ + prev__ = cmpxchg(ptr__, old__, new__); \ + } while (prev__ != old__); \ + preempt_enable_notrace(); \ + new__; \ +}) + +#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) + +#ifndef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define arch_this_cpu_add(pcp, val, op1, op2, szcast) \ +{ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + if (__builtin_constant_p(val__) && \ + ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \ + asm volatile( \ + op2 " %[ptr__],%[val__]\n" \ + : [ptr__] "+Q" (*ptr__) \ + : [val__] "i" ((szcast)val__) \ + : "cc"); \ + } else { \ + asm volatile( \ + op1 " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + } \ + preempt_enable_notrace(); \ +} + +#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int) +#define this_cpu_add_8(pcp, val) arch_this_cpu_add(pcp, val, "laag", "agsi", long) + +#define arch_this_cpu_add_return(pcp, val, op) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + asm volatile( \ + op " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + preempt_enable_notrace(); \ + old__ + val__; \ +}) + +#define this_cpu_add_return_4(pcp, val) arch_this_cpu_add_return(pcp, val, "laa") +#define this_cpu_add_return_8(pcp, val) arch_this_cpu_add_return(pcp, val, "laag") + +#define arch_this_cpu_to_op(pcp, val, op) \ +{ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + asm volatile( \ + op " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + preempt_enable_notrace(); \ +} + +#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan") +#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, "lang") +#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lao") +#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, "laog") + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define arch_this_cpu_cmpxchg(pcp, oval, nval) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ ret__; \ + pcp_op_T__ *ptr__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = cmpxchg(ptr__, oval, nval); \ + preempt_enable_notrace(); \ + ret__; \ +}) + +#define this_cpu_cmpxchg_1(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval) + +#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n) + +#define this_cpu_cmpxchg128(pcp, oval, nval) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + u128 old__, new__, ret__; \ + pcp_op_T__ *ptr__; \ + old__ = oval; \ + new__ = nval; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = cmpxchg128((void *)ptr__, old__, new__); \ + preempt_enable_notrace(); \ + ret__; \ +}) + +#define arch_this_cpu_xchg(pcp, nval) \ +({ \ + typeof(pcp) *ptr__; \ + typeof(pcp) ret__; \ + preempt_disable_notrace(); \ + ptr__ = raw_cpu_ptr(&(pcp)); \ + ret__ = xchg(ptr__, nval); \ + preempt_enable_notrace(); \ + ret__; \ +}) + +#define this_cpu_xchg_1(pcp, nval) arch_this_cpu_xchg(pcp, nval) +#define this_cpu_xchg_2(pcp, nval) arch_this_cpu_xchg(pcp, nval) +#define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval) +#define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval) + +#include + +#endif /* __ARCH_S390_PERCPU__ */ diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h new file mode 100644 index 0000000000..9917e2717b --- /dev/null +++ b/arch/s390/include/asm/perf_event.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Performance event support - s390 specific definitions. + * + * Copyright IBM Corp. 2009, 2017 + * Author(s): Martin Schwidefsky + * Hendrik Brueckner + */ + +#ifndef _ASM_S390_PERF_EVENT_H +#define _ASM_S390_PERF_EVENT_H + +#include +#include +#include + +/* Per-CPU flags for PMU states */ +#define PMU_F_RESERVED 0x1000 +#define PMU_F_ENABLED 0x2000 +#define PMU_F_IN_USE 0x4000 +#define PMU_F_ERR_IBE 0x0100 +#define PMU_F_ERR_LSDA 0x0200 +#define PMU_F_ERR_MASK (PMU_F_ERR_IBE|PMU_F_ERR_LSDA) + +/* Perf definitions for PMU event attributes in sysfs */ +extern __init const struct attribute_group **cpumf_cf_event_group(void); +extern ssize_t cpumf_events_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page); +#define EVENT_VAR(_cat, _name) event_attr_##_cat##_##_name +#define EVENT_PTR(_cat, _name) (&EVENT_VAR(_cat, _name).attr.attr) + +#define CPUMF_EVENT_ATTR(cat, name, id) \ + PMU_EVENT_ATTR(name, EVENT_VAR(cat, name), id, cpumf_events_sysfs_show) +#define CPUMF_EVENT_PTR(cat, name) EVENT_PTR(cat, name) + + +/* Perf callbacks */ +struct pt_regs; +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) +#define perf_arch_bpf_user_pt_regs(regs) ®s->user_regs + +/* Perf pt_regs extension for sample-data-entry indicators */ +struct perf_sf_sde_regs { + unsigned char in_guest:1; /* guest sample */ + unsigned long reserved:63; /* reserved */ +}; + +/* Perf PMU definitions for the counter facility */ +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ + +/* Perf PMU definitions for the sampling facility */ +#define PERF_CPUM_SF_MAX_CTR 2 +#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ +#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ +#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ +#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ +#define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ + PERF_CPUM_SF_DIAG_MODE) +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ + +#define REG_NONE 0 +#define REG_OVERFLOW 1 +#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) +#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) +#define TEAR_REG(hwc) ((hwc)->last_tag) +#define SAMPL_RATE(hwc) ((hwc)->event_base) +#define SAMPL_FLAGS(hwc) ((hwc)->config_base) +#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) +#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) + +#define perf_arch_fetch_caller_regs(regs, __ip) do { \ + (regs)->psw.addr = (__ip); \ + (regs)->gprs[15] = (unsigned long)__builtin_frame_address(0) - \ + offsetof(struct stack_frame, back_chain); \ +} while (0) + +#endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h new file mode 100644 index 0000000000..a1bee4a1e4 --- /dev/null +++ b/arch/s390/include/asm/pfault.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_PFAULT_H +#define _ASM_S390_PFAULT_H + +#include + +int __pfault_init(void); +void __pfault_fini(void); + +static inline int pfault_init(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + return __pfault_init(); + return -EOPNOTSUPP; +} + +static inline void pfault_fini(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + __pfault_fini(); +} + +#endif /* _ASM_S390_PFAULT_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h new file mode 100644 index 0000000000..376b4b23bd --- /dev/null +++ b/arch/s390/include/asm/pgalloc.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/pgalloc.h" + * Copyright (C) 1994 Linus Torvalds + */ + +#ifndef _S390_PGALLOC_H +#define _S390_PGALLOC_H + +#include +#include +#include +#include + +#define CRST_ALLOC_ORDER 2 + +unsigned long *crst_table_alloc(struct mm_struct *); +void crst_table_free(struct mm_struct *, unsigned long *); + +unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); +void page_table_free(struct mm_struct *, unsigned long *); +void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); +extern int page_table_allocate_pgste; + +static inline void crst_table_init(unsigned long *crst, unsigned long entry) +{ + memset64((u64 *)crst, entry, _CRST_ENTRIES); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); + +static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + int rc; + + if (addr + len > mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} + +static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +{ + unsigned long *table = crst_table_alloc(mm); + + if (table) + crst_table_init(table, _REGION2_ENTRY_EMPTY); + return (p4d_t *) table; +} + +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + crst_table_free(mm, (unsigned long *) p4d); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) +{ + unsigned long *table = crst_table_alloc(mm); + if (table) + crst_table_init(table, _REGION3_ENTRY_EMPTY); + return (pud_t *) table; +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (!mm_pud_folded(mm)) + crst_table_free(mm, (unsigned long *) pud); +} + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *table = crst_table_alloc(mm); + + if (!table) + return NULL; + crst_table_init(table, _SEGMENT_ENTRY_EMPTY); + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { + crst_table_free(mm, table); + return NULL; + } + return (pmd_t *) table; +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + if (mm_pmd_folded(mm)) + return; + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + crst_table_free(mm, (unsigned long *) pmd); +} + +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + set_pgd(pgd, __pgd(_REGION1_ENTRY | __pa(p4d))); +} + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + set_p4d(p4d, __p4d(_REGION2_ENTRY | __pa(pud))); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); +} + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + return (pgd_t *) crst_table_alloc(mm); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + crst_table_free(mm, (unsigned long *) pgd); +} + +static inline void pmd_populate(struct mm_struct *mm, + pmd_t *pmd, pgtable_t pte) +{ + set_pmd(pmd, __pmd(_SEGMENT_ENTRY | __pa(pte))); +} + +#define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte) + +/* + * page table entry allocation/free routines. + */ +#define pte_alloc_one_kernel(mm) ((pte_t *)page_table_alloc(mm)) +#define pte_alloc_one(mm) ((pte_t *)page_table_alloc(mm)) + +#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) +#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) + +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + +void vmem_map_init(void); +void *vmem_crst_alloc(unsigned long val); +pte_t *vmem_pte_alloc(void); + +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages); +void base_asce_free(unsigned long asce); + +#endif /* _S390_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h new file mode 100644 index 0000000000..fb3ee7758b --- /dev/null +++ b/arch/s390/include/asm/pgtable.h @@ -0,0 +1,1872 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * Ulrich Weigand (weigand@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/pgtable.h" + */ + +#ifndef _ASM_S390_PGTABLE_H +#define _ASM_S390_PGTABLE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; +extern void paging_init(void); +extern unsigned long s390_invalid_asce; + +enum { + PG_DIRECT_MAP_4K = 0, + PG_DIRECT_MAP_1M, + PG_DIRECT_MAP_2G, + PG_DIRECT_MAP_MAX +}; + +extern atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); + +static inline void update_page_count(int level, long count) +{ + if (IS_ENABLED(CONFIG_PROC_FS)) + atomic_long_add(count, &direct_pages_count[level]); +} + +/* + * The S390 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) +#define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) + +/* + * ZERO_PAGE is a global shared page that is always zero; used + * for zero-mapped memory areas etc.. + */ + +extern unsigned long empty_zero_page; +extern unsigned long zero_page_mask; + +#define ZERO_PAGE(vaddr) \ + (virt_to_page((void *)(empty_zero_page + \ + (((unsigned long)(vaddr)) &zero_page_mask)))) +#define __HAVE_COLOR_ZERO_PAGE + +/* TODO: s390 cannot support io_remap_pfn_range... */ + +#define pte_ERROR(e) \ + pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) +#define pmd_ERROR(e) \ + pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) +#define p4d_ERROR(e) \ + pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e)) +#define pgd_ERROR(e) \ + pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) + +/* + * The vmalloc and module area will always be on the topmost area of the + * kernel mapping. 512GB are reserved for vmalloc by default. + * At the top of the vmalloc area a 2GB area is reserved where modules + * will reside. That makes sure that inter module branches always + * happen without trampolines and in addition the placement within a + * 2GB frame is branch prediction unit friendly. + */ +extern unsigned long __bootdata_preserved(VMALLOC_START); +extern unsigned long __bootdata_preserved(VMALLOC_END); +#define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN) +extern struct page *__bootdata_preserved(vmemmap); +extern unsigned long __bootdata_preserved(vmemmap_size); + +extern unsigned long __bootdata_preserved(MODULES_VADDR); +extern unsigned long __bootdata_preserved(MODULES_END); +#define MODULES_VADDR MODULES_VADDR +#define MODULES_END MODULES_END +#define MODULES_LEN (1UL << 31) + +static inline int is_module_addr(void *addr) +{ + BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); + if (addr < (void *)MODULES_VADDR) + return 0; + if (addr > (void *)MODULES_END) + return 0; + return 1; +} + +/* + * A 64 bit pagetable entry of S390 has following format: + * | PFRA |0IPC| OS | + * 0000000000111111111122222222223333333333444444444455555555556666 + * 0123456789012345678901234567890123456789012345678901234567890123 + * + * I Page-Invalid Bit: Page is not available for address-translation + * P Page-Protection Bit: Store access not possible for page + * C Change-bit override: HW is not required to set change bit + * + * A 64 bit segmenttable entry of S390 has following format: + * | P-table origin | TT + * 0000000000111111111122222222223333333333444444444455555555556666 + * 0123456789012345678901234567890123456789012345678901234567890123 + * + * I Segment-Invalid Bit: Segment is not available for address-translation + * C Common-Segment Bit: Segment is not private (PoP 3-30) + * P Page-Protection Bit: Store access not possible for page + * TT Type 00 + * + * A 64 bit region table entry of S390 has following format: + * | S-table origin | TF TTTL + * 0000000000111111111122222222223333333333444444444455555555556666 + * 0123456789012345678901234567890123456789012345678901234567890123 + * + * I Segment-Invalid Bit: Segment is not available for address-translation + * TT Type 01 + * TF + * TL Table length + * + * The 64 bit regiontable origin of S390 has following format: + * | region table origon | DTTL + * 0000000000111111111122222222223333333333444444444455555555556666 + * 0123456789012345678901234567890123456789012345678901234567890123 + * + * X Space-Switch event: + * G Segment-Invalid Bit: + * P Private-Space Bit: + * S Storage-Alteration: + * R Real space + * TL Table-Length: + * + * A storage key has the following format: + * | ACC |F|R|C|0| + * 0 3 4 5 6 7 + * ACC: access key + * F : fetch protection bit + * R : referenced bit + * C : changed bit + */ + +/* Hardware bits in the page table entry */ +#define _PAGE_NOEXEC 0x100 /* HW no-execute bit */ +#define _PAGE_PROTECT 0x200 /* HW read-only bit */ +#define _PAGE_INVALID 0x400 /* HW invalid bit */ +#define _PAGE_LARGE 0x800 /* Bit to mark a large pte */ + +/* Software bits in the page table entry */ +#define _PAGE_PRESENT 0x001 /* SW pte present bit */ +#define _PAGE_YOUNG 0x004 /* SW pte young bit */ +#define _PAGE_DIRTY 0x008 /* SW pte dirty bit */ +#define _PAGE_READ 0x010 /* SW pte read bit */ +#define _PAGE_WRITE 0x020 /* SW pte write bit */ +#define _PAGE_SPECIAL 0x040 /* SW associated with special page */ +#define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ +#else +#define _PAGE_SOFT_DIRTY 0x000 +#endif + +#define _PAGE_SW_BITS 0xffUL /* All SW bits */ + +#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \ + _PAGE_YOUNG | _PAGE_SOFT_DIRTY) + +/* + * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT + * HW bit and all SW bits. + */ +#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS) + +/* + * handle_pte_fault uses pte_present and pte_none to find out the pte type + * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to + * distinguish present from not-present ptes. It is changed only with the page + * table lock held. + * + * The following table gives the different possible bit combinations for + * the pte hardware and software bits in the last 12 bits of a pte + * (. unassigned bit, x don't care, t swap type): + * + * 842100000000 + * 000084210000 + * 000000008421 + * .IR.uswrdy.p + * empty .10.00000000 + * swap .11..ttttt.0 + * prot-none, clean, old .11.xx0000.1 + * prot-none, clean, young .11.xx0001.1 + * prot-none, dirty, old .11.xx0010.1 + * prot-none, dirty, young .11.xx0011.1 + * read-only, clean, old .11.xx0100.1 + * read-only, clean, young .01.xx0101.1 + * read-only, dirty, old .11.xx0110.1 + * read-only, dirty, young .01.xx0111.1 + * read-write, clean, old .11.xx1100.1 + * read-write, clean, young .01.xx1101.1 + * read-write, dirty, old .10.xx1110.1 + * read-write, dirty, young .00.xx1111.1 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + * + * pte_none is true for the bit pattern .10.00000000, pte == 0x400 + * pte_swap is true for the bit pattern .11..ooooo.0, (pte & 0x201) == 0x200 + * pte_present is true for the bit pattern .xx.xxxxxx.1, (pte & 0x001) == 0x001 + */ + +/* Bits in the segment/region table address-space-control-element */ +#define _ASCE_ORIGIN ~0xfffUL/* region/segment table origin */ +#define _ASCE_PRIVATE_SPACE 0x100 /* private space control */ +#define _ASCE_ALT_EVENT 0x80 /* storage alteration event control */ +#define _ASCE_SPACE_SWITCH 0x40 /* space switch event */ +#define _ASCE_REAL_SPACE 0x20 /* real space control */ +#define _ASCE_TYPE_MASK 0x0c /* asce table type mask */ +#define _ASCE_TYPE_REGION1 0x0c /* region first table type */ +#define _ASCE_TYPE_REGION2 0x08 /* region second table type */ +#define _ASCE_TYPE_REGION3 0x04 /* region third table type */ +#define _ASCE_TYPE_SEGMENT 0x00 /* segment table type */ +#define _ASCE_TABLE_LENGTH 0x03 /* region table length */ + +/* Bits in the region table entry */ +#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ +#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ +#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ +#define _REGION_ENTRY_TYPE_MASK 0x0c /* region table type mask */ +#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ +#define _REGION_ENTRY_TYPE_R2 0x08 /* region second table type */ +#define _REGION_ENTRY_TYPE_R3 0x04 /* region third table type */ +#define _REGION_ENTRY_LENGTH 0x03 /* region third length */ + +#define _REGION1_ENTRY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_LENGTH) +#define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID) +#define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH) +#define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID) +#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) +#define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) + +#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ +#define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ +#define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ +#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */ +#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#else +#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ +#endif + +#define _REGION_ENTRY_BITS 0xfffffffffffff22fUL + +/* Bits in the segment table entry */ +#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL +#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL +#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL +#define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ +#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ +#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ +#define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ +#define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ +#define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ + +#define _SEGMENT_ENTRY (0) +#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) + +#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ +#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ +#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ +#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ +#else +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ +#endif + +#define _CRST_ENTRIES 2048 /* number of region/segment table entries */ +#define _PAGE_ENTRIES 256 /* number of page table entries */ + +#define _CRST_TABLE_SIZE (_CRST_ENTRIES * 8) +#define _PAGE_TABLE_SIZE (_PAGE_ENTRIES * 8) + +#define _REGION1_SHIFT 53 +#define _REGION2_SHIFT 42 +#define _REGION3_SHIFT 31 +#define _SEGMENT_SHIFT 20 + +#define _REGION1_INDEX (0x7ffUL << _REGION1_SHIFT) +#define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) +#define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) +#define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) +#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) + +#define _REGION1_SIZE (1UL << _REGION1_SHIFT) +#define _REGION2_SIZE (1UL << _REGION2_SHIFT) +#define _REGION3_SIZE (1UL << _REGION3_SHIFT) +#define _SEGMENT_SIZE (1UL << _SEGMENT_SHIFT) + +#define _REGION1_MASK (~(_REGION1_SIZE - 1)) +#define _REGION2_MASK (~(_REGION2_SIZE - 1)) +#define _REGION3_MASK (~(_REGION3_SIZE - 1)) +#define _SEGMENT_MASK (~(_SEGMENT_SIZE - 1)) + +#define PMD_SHIFT _SEGMENT_SHIFT +#define PUD_SHIFT _REGION3_SHIFT +#define P4D_SHIFT _REGION2_SHIFT +#define PGDIR_SHIFT _REGION1_SHIFT + +#define PMD_SIZE _SEGMENT_SIZE +#define PUD_SIZE _REGION3_SIZE +#define P4D_SIZE _REGION2_SIZE +#define PGDIR_SIZE _REGION1_SIZE + +#define PMD_MASK _SEGMENT_MASK +#define PUD_MASK _REGION3_MASK +#define P4D_MASK _REGION2_MASK +#define PGDIR_MASK _REGION1_MASK + +#define PTRS_PER_PTE _PAGE_ENTRIES +#define PTRS_PER_PMD _CRST_ENTRIES +#define PTRS_PER_PUD _CRST_ENTRIES +#define PTRS_PER_P4D _CRST_ENTRIES +#define PTRS_PER_PGD _CRST_ENTRIES + +/* + * Segment table and region3 table entry encoding + * (R = read-only, I = invalid, y = young bit): + * dy..R...I...wr + * prot-none, clean, old 00..1...1...00 + * prot-none, clean, young 01..1...1...00 + * prot-none, dirty, old 10..1...1...00 + * prot-none, dirty, young 11..1...1...00 + * read-only, clean, old 00..1...1...01 + * read-only, clean, young 01..1...0...01 + * read-only, dirty, old 10..1...1...01 + * read-only, dirty, young 11..1...0...01 + * read-write, clean, old 00..1...1...11 + * read-write, clean, young 01..1...0...11 + * read-write, dirty, old 10..0...1...11 + * read-write, dirty, young 11..0...0...11 + * The segment table origin is used to distinguish empty (origin==0) from + * read-write, old segment table entries (origin!=0) + * HW-bits: R read-only, I invalid + * SW-bits: y young, d dirty, r read, w write + */ + +/* Page status table bits for virtualization */ +#define PGSTE_ACC_BITS 0xf000000000000000UL +#define PGSTE_FP_BIT 0x0800000000000000UL +#define PGSTE_PCL_BIT 0x0080000000000000UL +#define PGSTE_HR_BIT 0x0040000000000000UL +#define PGSTE_HC_BIT 0x0020000000000000UL +#define PGSTE_GR_BIT 0x0004000000000000UL +#define PGSTE_GC_BIT 0x0002000000000000UL +#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ + +/* Guest Page State used for virtualization */ +#define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL +#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL +#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL +#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL +#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL +#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK + +/* + * A user page table pointer has the space-switch-event bit, the + * private-space-control bit and the storage-alteration-event-control + * bit set. A kernel page table pointer doesn't need them. + */ +#define _ASCE_USER_BITS (_ASCE_SPACE_SWITCH | _ASCE_PRIVATE_SPACE | \ + _ASCE_ALT_EVENT) + +/* + * Page protection definitions. + */ +#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT) +#define PAGE_RO __pgprot(_PAGE_PRESENT | _PAGE_READ | \ + _PAGE_NOEXEC | _PAGE_INVALID | _PAGE_PROTECT) +#define PAGE_RX __pgprot(_PAGE_PRESENT | _PAGE_READ | \ + _PAGE_INVALID | _PAGE_PROTECT) +#define PAGE_RW __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + _PAGE_NOEXEC | _PAGE_INVALID | _PAGE_PROTECT) +#define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + _PAGE_INVALID | _PAGE_PROTECT) + +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + _PAGE_YOUNG | _PAGE_DIRTY | _PAGE_NOEXEC) +#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + _PAGE_YOUNG | _PAGE_DIRTY | _PAGE_NOEXEC) +#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_YOUNG | \ + _PAGE_PROTECT | _PAGE_NOEXEC) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + _PAGE_YOUNG | _PAGE_DIRTY) + +/* + * On s390 the page table entry has an invalid bit and a read-only bit. + * Read permission implies execute permission and write permission + * implies read permission. + */ + /*xwr*/ + +/* + * Segment entry (large page) protection definitions. + */ +#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \ + _SEGMENT_ENTRY_PROTECT) +#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_NOEXEC) +#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \ + _SEGMENT_ENTRY_READ) +#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_WRITE | \ + _SEGMENT_ENTRY_NOEXEC) +#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_WRITE) +#define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ + _SEGMENT_ENTRY_LARGE | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_WRITE | \ + _SEGMENT_ENTRY_YOUNG | \ + _SEGMENT_ENTRY_DIRTY | \ + _SEGMENT_ENTRY_NOEXEC) +#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY | \ + _SEGMENT_ENTRY_LARGE | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_YOUNG | \ + _SEGMENT_ENTRY_PROTECT | \ + _SEGMENT_ENTRY_NOEXEC) +#define SEGMENT_KERNEL_EXEC __pgprot(_SEGMENT_ENTRY | \ + _SEGMENT_ENTRY_LARGE | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_WRITE | \ + _SEGMENT_ENTRY_YOUNG | \ + _SEGMENT_ENTRY_DIRTY) + +/* + * Region3 entry (large page) protection definitions. + */ + +#define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY | \ + _REGION_ENTRY_NOEXEC) +#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_YOUNG | \ + _REGION_ENTRY_PROTECT | \ + _REGION_ENTRY_NOEXEC) +#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY) + +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION1_SIZE; +} +#define mm_p4d_folded(mm) mm_p4d_folded(mm) + +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION2_SIZE; +} +#define mm_pud_folded(mm) mm_pud_folded(mm) + +static inline bool mm_pmd_folded(struct mm_struct *mm) +{ + return mm->context.asce_limit <= _REGION3_SIZE; +} +#define mm_pmd_folded(mm) mm_pmd_folded(mm) + +static inline int mm_has_pgste(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(mm->context.has_pgste)) + return 1; +#endif + return 0; +} + +static inline int mm_is_protected(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(atomic_read(&mm->context.protected_count))) + return 1; +#endif + return 0; +} + +static inline int mm_alloc_pgste(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(mm->context.alloc_pgste)) + return 1; +#endif + return 0; +} + +static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) & ~pgprot_val(prot)); +} + +static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) +{ + return __pte(pte_val(pte) | pgprot_val(prot)); +} + +static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) & ~pgprot_val(prot)); +} + +static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(prot)); +} + +static inline pud_t clear_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) & ~pgprot_val(prot)); +} + +static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) +{ + return __pud(pud_val(pud) | pgprot_val(prot)); +} + +/* + * In the case that a guest uses storage keys + * faults should no longer be backed by zero pages + */ +#define mm_forbids_zeropage mm_has_pgste +static inline int mm_uses_skeys(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (mm->context.uses_skeys) + return 1; +#endif + return 0; +} + +static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) +{ + union register_pair r1 = { .even = old, .odd = new, }; + unsigned long address = (unsigned long)ptr | 1; + + asm volatile( + " csp %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) + : "cc"); +} + +static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new) +{ + union register_pair r1 = { .even = old, .odd = new, }; + unsigned long address = (unsigned long)ptr | 1; + + asm volatile( + " cspg %[r1],%[address]" + : [r1] "+&d" (r1.pair), "+m" (*ptr) + : [address] "d" (address) + : "cc"); +} + +#define CRDTE_DTT_PAGE 0x00UL +#define CRDTE_DTT_SEGMENT 0x10UL +#define CRDTE_DTT_REGION3 0x14UL +#define CRDTE_DTT_REGION2 0x18UL +#define CRDTE_DTT_REGION1 0x1cUL + +static inline void crdte(unsigned long old, unsigned long new, + unsigned long *table, unsigned long dtt, + unsigned long address, unsigned long asce) +{ + union register_pair r1 = { .even = old, .odd = new, }; + union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, }; + + asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0" + : [r1] "+&d" (r1.pair) + : [r2] "d" (r2.pair), [asce] "a" (asce) + : "memory", "cc"); +} + +/* + * pgd/p4d/pud/pmd/pte query functions + */ +static inline int pgd_folded(pgd_t pgd) +{ + return (pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1; +} + +static inline int pgd_present(pgd_t pgd) +{ + if (pgd_folded(pgd)) + return 1; + return (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) != 0UL; +} + +static inline int pgd_none(pgd_t pgd) +{ + if (pgd_folded(pgd)) + return 0; + return (pgd_val(pgd) & _REGION_ENTRY_INVALID) != 0UL; +} + +static inline int pgd_bad(pgd_t pgd) +{ + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + return 0; + return (pgd_val(pgd) & ~_REGION_ENTRY_BITS) != 0; +} + +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + return (pgd_val(pgd) & origin_mask) >> PAGE_SHIFT; +} + +static inline int p4d_folded(p4d_t p4d) +{ + return (p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2; +} + +static inline int p4d_present(p4d_t p4d) +{ + if (p4d_folded(p4d)) + return 1; + return (p4d_val(p4d) & _REGION_ENTRY_ORIGIN) != 0UL; +} + +static inline int p4d_none(p4d_t p4d) +{ + if (p4d_folded(p4d)) + return 0; + return p4d_val(p4d) == _REGION2_ENTRY_EMPTY; +} + +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + return (p4d_val(p4d) & origin_mask) >> PAGE_SHIFT; +} + +static inline int pud_folded(pud_t pud) +{ + return (pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3; +} + +static inline int pud_present(pud_t pud) +{ + if (pud_folded(pud)) + return 1; + return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; +} + +static inline int pud_none(pud_t pud) +{ + if (pud_folded(pud)) + return 0; + return pud_val(pud) == _REGION3_ENTRY_EMPTY; +} + +#define pud_leaf pud_large +static inline int pud_large(pud_t pud) +{ + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) + return 0; + return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); +} + +#define pmd_leaf pmd_large +static inline int pmd_large(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; +} + +static inline int pmd_bad(pmd_t pmd) +{ + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0 || pmd_large(pmd)) + return 1; + return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; +} + +static inline int pud_bad(pud_t pud) +{ + unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R3 || pud_large(pud)) + return 1; + if (type < _REGION_ENTRY_TYPE_R3) + return 0; + return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; +} + +static inline int p4d_bad(p4d_t p4d) +{ + unsigned long type = p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R2) + return 1; + if (type < _REGION_ENTRY_TYPE_R2) + return 0; + return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; +} + +static inline int pmd_present(pmd_t pmd) +{ + return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; +} + +static inline int pmd_none(pmd_t pmd) +{ + return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; +} + +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; +} + +static inline int pmd_dirty(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; +} + +#define pmd_young pmd_young +static inline int pmd_young(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0; +} + +static inline int pte_present(pte_t pte) +{ + /* Bit pattern: (pte & 0x001) == 0x001 */ + return (pte_val(pte) & _PAGE_PRESENT) != 0; +} + +static inline int pte_none(pte_t pte) +{ + /* Bit pattern: pte == 0x400 */ + return pte_val(pte) == _PAGE_INVALID; +} + +static inline int pte_swap(pte_t pte) +{ + /* Bit pattern: (pte & 0x201) == 0x200 */ + return (pte_val(pte) & (_PAGE_PROTECT | _PAGE_PRESENT)) + == _PAGE_PROTECT; +} + +static inline int pte_special(pte_t pte) +{ + return (pte_val(pte) & _PAGE_SPECIAL); +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return pte_val(a) == pte_val(b); +} + +#ifdef CONFIG_NUMA_BALANCING +static inline int pte_protnone(pte_t pte) +{ + return pte_present(pte) && !(pte_val(pte) & _PAGE_READ); +} + +static inline int pmd_protnone(pmd_t pmd) +{ + /* pmd_large(pmd) implies pmd_present(pmd) */ + return pmd_large(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); +} +#endif + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE)); +} + +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_val(pte) & _PAGE_SOFT_DIRTY; +} +#define pte_swp_soft_dirty pte_soft_dirty + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); +} +#define pte_swp_mksoft_dirty pte_mksoft_dirty + +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY)); +} +#define pte_swp_clear_soft_dirty pte_clear_soft_dirty + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY; +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY)); +} + +/* + * query functions pte_write/pte_dirty/pte_young only work if + * pte_present() is true. Undefined behaviour if not.. + */ +static inline int pte_write(pte_t pte) +{ + return (pte_val(pte) & _PAGE_WRITE) != 0; +} + +static inline int pte_dirty(pte_t pte) +{ + return (pte_val(pte) & _PAGE_DIRTY) != 0; +} + +static inline int pte_young(pte_t pte) +{ + return (pte_val(pte) & _PAGE_YOUNG) != 0; +} + +#define __HAVE_ARCH_PTE_UNUSED +static inline int pte_unused(pte_t pte) +{ + return pte_val(pte) & _PAGE_UNUSED; +} + +/* + * Extract the pgprot value from the given pte while at the same time making it + * usable for kernel address space mappings where fault driven dirty and + * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID + * must not be set. + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; + + if (pte_write(pte)) + pte_flags |= pgprot_val(PAGE_KERNEL); + else + pte_flags |= pgprot_val(PAGE_KERNEL_RO); + pte_flags |= pte_val(pte) & mio_wb_bit_mask; + + return __pgprot(pte_flags); +} + +/* + * pgd/pmd/pte modification functions + */ + +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + WRITE_ONCE(*pgdp, pgd); +} + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + WRITE_ONCE(*p4dp, p4d); +} + +static inline void set_pud(pud_t *pudp, pud_t pud) +{ + WRITE_ONCE(*pudp, pud); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + WRITE_ONCE(*pmdp, pmd); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*ptep, pte); +} + +static inline void pgd_clear(pgd_t *pgd) +{ + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) + set_pgd(pgd, __pgd(_REGION1_ENTRY_EMPTY)); +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + set_p4d(p4d, __p4d(_REGION2_ENTRY_EMPTY)); +} + +static inline void pud_clear(pud_t *pud) +{ + if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + set_pud(pud, __pud(_REGION3_ENTRY_EMPTY)); +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + set_pte(ptep, __pte(_PAGE_INVALID)); +} + +/* + * The following pte modification functions only work if + * pte_present() is true. Undefined behaviour if not.. + */ +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK)); + pte = set_pte_bit(pte, newprot); + /* + * newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX + * has the invalid bit set, clear it again for readable, young pages + */ + if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ)) + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); + /* + * newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page + * protection bit set, clear it again for writable, dirty pages + */ + if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE)) + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + return pte; +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte = clear_pte_bit(pte, __pgprot(_PAGE_WRITE)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); +} + +static inline pte_t pte_mkwrite_novma(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE)); + if (pte_val(pte) & _PAGE_DIRTY) + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + return pte; +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + pte = clear_pte_bit(pte, __pgprot(_PAGE_DIRTY)); + return set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(_PAGE_DIRTY | _PAGE_SOFT_DIRTY)); + if (pte_val(pte) & _PAGE_WRITE) + pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + return pte; +} + +static inline pte_t pte_mkold(pte_t pte) +{ + pte = clear_pte_bit(pte, __pgprot(_PAGE_YOUNG)); + return set_pte_bit(pte, __pgprot(_PAGE_INVALID)); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(_PAGE_YOUNG)); + if (pte_val(pte) & _PAGE_READ) + pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); + return pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_SPECIAL)); +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline pte_t pte_mkhuge(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(_PAGE_LARGE)); +} +#endif + +#define IPTE_GLOBAL 0 +#define IPTE_LOCAL 1 + +#define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 + +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto; + + pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + : "+m" (*ptep) + : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), + [asce] "a" (asce), [m4] "i" (local)); +} + +static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto = __pa(ptep); + + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " ipte %[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); + asm volatile( + " ipte %[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); +} + +static __always_inline void __ptep_ipte_range(unsigned long address, int nr, + pte_t *ptep, int local) +{ + unsigned long pto = __pa(ptep); + + /* Invalidate a range of ptes + TLB flush of the ptes */ + do { + asm volatile( + " ipte %[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (nr) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); + } while (nr != 255); +} + +/* + * This is hard to understand. ptep_get_and_clear and ptep_clear_flush + * both clear the TLB for the unmapped pte. The reason is that + * ptep_get_and_clear is used in common code (e.g. change_pte_range) + * to modify an active pte. The sequence is + * 1) ptep_get_and_clear + * 2) set_pte_at + * 3) flush_tlb_range + * On s390 the tlb needs to get flushed with the modification of the pte + * if the pte is active. The only way how this can be implemented is to + * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range + * is a nop. + */ +pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t); +pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t); + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + + pte = ptep_xchg_direct(vma->vm_mm, addr, ptep, pte_mkold(pte)); + return pte_young(pte); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + return ptep_test_and_clear_young(vma, address, ptep); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t res; + + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(mm) && pte_present(res)) + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; +} + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); + +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t res; + + res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + /* At this point the reference through the mapping is still present */ + if (mm_is_protected(vma->vm_mm) && pte_present(res)) + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; +} + +/* + * The batched pte unmap code uses ptep_get_and_clear_full to clear the + * ptes. Here an optimization is possible. tlb_gather_mmu flushes all + * tlbs of an mm if it can guarantee that the ptes of the mm_struct + * cannot be accessed while the batched unmap is running. In this case + * full==1 and a simple pte_clear is enough. See tlb.h. + */ +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, int full) +{ + pte_t res; + + if (full) { + res = *ptep; + set_pte(ptep, __pte(_PAGE_INVALID)); + } else { + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + } + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. + */ + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + return res; +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + + if (pte_write(pte)) + ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte)); +} + +/* + * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE + * bits in the comparison. Those might change e.g. because of dirty and young + * tracking. + */ +static inline int pte_allow_rdp(pte_t old, pte_t new) +{ + /* + * Only allow changes from RO to RW + */ + if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) + return 0; + + return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK); +} + +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + /* + * RDP might not have propagated the PTE protection reset to all CPUs, + * so there could be spurious TLB protection faults. + * NOTE: This will also be called when a racing pagetable update on + * another thread already installed the correct PTE. Both cases cannot + * really be distinguished. + * Therefore, only do the local TLB flush when RDP can be used, and the + * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead. + * A local RDP can be used to do the flush. + */ + if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT)) + __ptep_rdp(address, ptep, 0, 0, 1); +} +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault + +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new); + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + if (pte_same(*ptep, entry)) + return 0; + if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); + else + ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); + return 1; +} + +/* + * Additional functions to handle KVM guest page tables + */ +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry); +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot, unsigned long bit); +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , int reset); +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); + +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, + pte_t *ptep); +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq); +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc); +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key); + +int set_pgste_bits(struct mm_struct *mm, unsigned long addr, + unsigned long bits, unsigned long value); +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste); +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); + +#define pgprot_writecombine pgprot_writecombine +pgprot_t pgprot_writecombine(pgprot_t prot); + +#define pgprot_writethrough pgprot_writethrough +pgprot_t pgprot_writethrough(pgprot_t prot); + +/* + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. + */ +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) +{ + if (pte_present(entry)) + entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } +} +#define set_ptes set_ptes + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) +{ + pte_t __pte; + + __pte = __pte(physpage | pgprot_val(pgprot)); + if (!MACHINE_HAS_NX) + __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC)); + return pte_mkyoung(__pte); +} + +static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +{ + unsigned long physpage = page_to_phys(page); + pte_t __pte = mk_pte_phys(physpage, pgprot); + + if (pte_write(__pte) && PageDirty(page)) + __pte = pte_mkdirty(__pte); + return __pte; +} + +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) +#define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) + +#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN)) +#define pgd_deref(pgd) ((unsigned long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pmd_deref(pmd_t pmd) +{ + unsigned long origin_mask; + + origin_mask = _SEGMENT_ENTRY_ORIGIN; + if (pmd_large(pmd)) + origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pmd_val(pmd) & origin_mask); +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return __pa(pmd_deref(pmd)) >> PAGE_SHIFT; +} + +static inline unsigned long pud_deref(pud_t pud) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + if (pud_large(pud)) + origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; + return (unsigned long)__va(pud_val(pud) & origin_mask); +} + +static inline unsigned long pud_pfn(pud_t pud) +{ + return __pa(pud_deref(pud)) >> PAGE_SHIFT; +} + +/* + * The pgd_offset function *always* adds the index for the top-level + * region/segment table. This is done to get a sequence like the + * following to work: + * pgdp = pgd_offset(current->mm, addr); + * pgd = READ_ONCE(*pgdp); + * p4dp = p4d_offset(&pgd, addr); + * ... + * The subsequent p4d_offset, pud_offset and pmd_offset functions + * only add an index if they dereferenced the pointer. + */ +static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) +{ + unsigned long rste; + unsigned int shift; + + /* Get the first entry of the top level table */ + rste = pgd_val(*pgd); + /* Pick up the shift from the table type of the first entry */ + shift = ((rste & _REGION_ENTRY_TYPE_MASK) >> 2) * 11 + 20; + return pgd + ((address >> shift) & (PTRS_PER_PGD - 1)); +} + +#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) + +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address) +{ + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(pgd) + p4d_index(address); + return (p4d_t *) pgdp; +} +#define p4d_offset_lockless p4d_offset_lockless + +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address) +{ + return p4d_offset_lockless(pgdp, *pgdp, address); +} + +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address) +{ + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(p4d) + pud_index(address); + return (pud_t *) p4dp; +} +#define pud_offset_lockless pud_offset_lockless + +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address) +{ + return pud_offset_lockless(p4dp, *p4dp, address); +} +#define pud_offset pud_offset + +static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address) +{ + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(pud) + pmd_index(address); + return (pmd_t *) pudp; +} +#define pmd_offset_lockless pmd_offset_lockless + +static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address) +{ + return pmd_offset_lockless(pudp, *pudp, address); +} +#define pmd_offset pmd_offset + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long) pmd_deref(pmd); +} + +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) +{ + return end <= current->mm->context.asce_limit; +} +#define gup_fast_permitted gup_fast_permitted + +#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot)) +#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) +#define pte_page(x) pfn_to_page(pte_pfn(x)) + +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); +} + +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE)); + if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); + return pmd; +} + +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY)); + if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); + return pmd; +} + +static inline pud_t pud_wrprotect(pud_t pud) +{ + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE)); + if (pud_val(pud) & _REGION3_ENTRY_DIRTY) + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); + return pud; +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY)); + return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY)); + if (pud_val(pud) & _REGION3_ENTRY_WRITE) + pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT)); + return pud; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) +{ + /* + * pgprot is PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW or PAGE_RWX + * (see __Pxxx / __Sxxx). Convert to segment table entry format. + */ + if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE)) + return pgprot_val(SEGMENT_NONE); + if (pgprot_val(pgprot) == pgprot_val(PAGE_RO)) + return pgprot_val(SEGMENT_RO); + if (pgprot_val(pgprot) == pgprot_val(PAGE_RX)) + return pgprot_val(SEGMENT_RX); + if (pgprot_val(pgprot) == pgprot_val(PAGE_RW)) + return pgprot_val(SEGMENT_RW); + return pgprot_val(SEGMENT_RWX); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + if (pmd_val(pmd) & _SEGMENT_ENTRY_READ) + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); + return pmd; +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); +} + +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long mask; + + mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + mask |= _SEGMENT_ENTRY_DIRTY; + mask |= _SEGMENT_ENTRY_YOUNG; + mask |= _SEGMENT_ENTRY_LARGE; + mask |= _SEGMENT_ENTRY_SOFT_DIRTY; + pmd = __pmd(pmd_val(pmd) & mask); + pmd = set_pmd_bit(pmd, __pgprot(massage_pgprot_pmd(newprot))); + if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); + if (!(pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG)) + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID)); + return pmd; +} + +static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) +{ + return __pmd(physpage + massage_pgprot_pmd(pgprot)); +} + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ + +static inline void __pmdp_csp(pmd_t *pmdp) +{ + csp((unsigned int *)pmdp + 1, pmd_val(*pmdp), + pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); +} + +#define IDTE_GLOBAL 0 +#define IDTE_LOCAL 1 + +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 + +static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long sto; + + sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " idte %[r1],0,%[r2],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " idte %[r1],%[r3],%[r2],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } +} + +static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long r3o; + + r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t); + r3o |= _ASCE_TYPE_REGION3; + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " idte %[r1],0,%[r2],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " idte %[r1],%[r3],%[r2],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } +} + +pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); +pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t); +pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define __HAVE_ARCH_PGTABLE_DEPOSIT +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); + +#define __HAVE_ARCH_PGTABLE_WITHDRAW +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + VM_BUG_ON(addr & ~HPAGE_MASK); + + entry = pmd_mkyoung(entry); + if (dirty) + entry = pmd_mkdirty(entry); + if (pmd_val(*pmdp) == pmd_val(entry)) + return 0; + pmdp_xchg_direct(vma->vm_mm, addr, pmdp, entry); + return 1; +} + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd_mkold(pmd)); + return pmd_young(pmd); +} + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + VM_BUG_ON(addr & ~HPAGE_MASK); + return pmdp_test_and_clear_young(vma, addr, pmdp); +} + +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t entry) +{ + if (!MACHINE_HAS_NX) + entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + set_pmd(pmdp, entry); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_LARGE)); + pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG)); + return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT)); +} + +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); +} + +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL +static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmdp, int full) +{ + if (full) { + pmd_t pmd = *pmdp; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pmd; + } + return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); +} + +#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH +static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + return pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); +} + +#define __HAVE_ARCH_PMDP_INVALIDATE +static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + + return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + if (pmd_write(pmd)) + pmd = pmdp_xchg_lazy(mm, addr, pmdp, pmd_wrprotect(pmd)); +} + +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); +} +#define pmdp_collapse_flush pmdp_collapse_flush + +#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; +} + +#define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_hugepage(void) +{ + return MACHINE_HAS_EDAT1 ? 1 : 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * 64 bit swap entry format: + * A page-table entry has some bits we have to treat in a special way. + * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte + * as invalid. + * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 + * | offset |E11XX|type |S0| + * |0000000000111111111122222222223333333333444444444455|55555|55566|66| + * |0123456789012345678901234567890123456789012345678901|23456|78901|23| + * + * Bits 0-51 store the offset. + * Bit 52 (E) is used to remember PG_anon_exclusive. + * Bits 57-61 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 55 and 56 (X) are unused. + */ + +#define __SWP_OFFSET_MASK ((1UL << 52) - 1) +#define __SWP_OFFSET_SHIFT 12 +#define __SWP_TYPE_MASK ((1UL << 5) - 1) +#define __SWP_TYPE_SHIFT 2 + +static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) +{ + unsigned long pteval; + + pteval = _PAGE_INVALID | _PAGE_PROTECT; + pteval |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT; + pteval |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT; + return __pte(pteval); +} + +static inline unsigned long __swp_type(swp_entry_t entry) +{ + return (entry.val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK; +} + +static inline unsigned long __swp_offset(swp_entry_t entry) +{ + return (entry.val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK; +} + +static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) +{ + return (swp_entry_t) { pte_val(mk_swap_pte(type, offset)) }; +} + +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +extern int vmem_add_mapping(unsigned long start, unsigned long size); +extern void vmem_remove_mapping(unsigned long start, unsigned long size); +extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); +extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); +extern void vmem_unmap_4k_page(unsigned long addr); +extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); +extern int s390_enable_sie(void); +extern int s390_enable_skey(void); +extern void s390_reset_cmma(struct mm_struct *mm); + +/* s390 has a private copy of get unmapped area to deal with cache synonyms */ +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pmd_pgtable(pmd) \ + ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) + +#endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h new file mode 100644 index 0000000000..9e41a74fce --- /dev/null +++ b/arch/s390/include/asm/physmem_info.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_MEM_DETECT_H +#define _ASM_S390_MEM_DETECT_H + +#include +#include + +enum physmem_info_source { + MEM_DETECT_NONE = 0, + MEM_DETECT_SCLP_STOR_INFO, + MEM_DETECT_DIAG260, + MEM_DETECT_SCLP_READ_INFO, + MEM_DETECT_BIN_SEARCH +}; + +struct physmem_range { + u64 start; + u64 end; +}; + +enum reserved_range_type { + RR_DECOMPRESSOR, + RR_INITRD, + RR_VMLINUX, + RR_AMODE31, + RR_IPLREPORT, + RR_CERT_COMP_LIST, + RR_MEM_DETECT_EXTENDED, + RR_VMEM, + RR_MAX +}; + +struct reserved_range { + unsigned long start; + unsigned long end; + struct reserved_range *chain; +}; + +/* + * Storage element id is defined as 1 byte (up to 256 storage elements). + * In practise only storage element id 0 and 1 are used). + * According to architecture one storage element could have as much as + * 1020 subincrements. 255 physmem_ranges are embedded in physmem_info. + * If more physmem_ranges are required, a block of memory from already + * known physmem_range is taken (online_extended points to it). + */ +#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */ + +struct physmem_info { + u32 range_count; + u8 info_source; + unsigned long usable; + struct reserved_range reserved[RR_MAX]; + struct physmem_range online[MEM_INLINED_ENTRIES]; + struct physmem_range *online_extended; +}; + +extern struct physmem_info physmem_info; + +void add_physmem_online_range(u64 start, u64 end); + +static inline int __get_physmem_range(u32 n, unsigned long *start, + unsigned long *end, bool respect_usable_limit) +{ + if (n >= physmem_info.range_count) { + *start = 0; + *end = 0; + return -1; + } + + if (n < MEM_INLINED_ENTRIES) { + *start = (unsigned long)physmem_info.online[n].start; + *end = (unsigned long)physmem_info.online[n].end; + } else { + *start = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].start; + *end = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].end; + } + + if (respect_usable_limit && physmem_info.usable) { + if (*start >= physmem_info.usable) + return -1; + if (*end > physmem_info.usable) + *end = physmem_info.usable; + } + return 0; +} + +/** + * for_each_physmem_usable_range - early online memory range iterator + * @i: an integer used as loop variable + * @p_start: ptr to unsigned long for start address of the range + * @p_end: ptr to unsigned long for end address of the range + * + * Walks over detected online memory ranges below usable limit. + */ +#define for_each_physmem_usable_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, true); i++) + +/* Walks over all detected online memory ranges disregarding usable limit. */ +#define for_each_physmem_online_range(i, p_start, p_end) \ + for (i = 0; !__get_physmem_range(i, p_start, p_end, false); i++) + +static inline const char *get_physmem_info_source(void) +{ + switch (physmem_info.info_source) { + case MEM_DETECT_SCLP_STOR_INFO: + return "sclp storage info"; + case MEM_DETECT_DIAG260: + return "diag260"; + case MEM_DETECT_SCLP_READ_INFO: + return "sclp read info"; + case MEM_DETECT_BIN_SEARCH: + return "binary search"; + } + return "none"; +} + +#define RR_TYPE_NAME(t) case RR_ ## t: return #t +static inline const char *get_rr_type_name(enum reserved_range_type t) +{ + switch (t) { + RR_TYPE_NAME(DECOMPRESSOR); + RR_TYPE_NAME(INITRD); + RR_TYPE_NAME(VMLINUX); + RR_TYPE_NAME(AMODE31); + RR_TYPE_NAME(IPLREPORT); + RR_TYPE_NAME(CERT_COMP_LIST); + RR_TYPE_NAME(MEM_DETECT_EXTENDED); + RR_TYPE_NAME(VMEM); + default: + return "UNKNOWN"; + } +} + +#define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \ + for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \ + range && range->end; range = range->chain ? __va(range->chain) : NULL, \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t, + struct reserved_range *range) +{ + if (!range) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + if (range->chain) + return __va(range->chain); + while (++*t < RR_MAX) { + range = &physmem_info.reserved[*t]; + if (range->end) + return range; + } + return NULL; +} + +#define for_each_physmem_reserved_range(t, range, p_start, p_end) \ + for (t = 0, range = __physmem_reserved_next(&t, NULL), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0; \ + range; range = __physmem_reserved_next(&t, range), \ + *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) + +static inline unsigned long get_physmem_reserved(enum reserved_range_type type, + unsigned long *addr, unsigned long *size) +{ + *addr = physmem_info.reserved[type].start; + *size = physmem_info.reserved[type].end - physmem_info.reserved[type].start; + return *size; +} + +#endif diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h new file mode 100644 index 0000000000..47d80a7451 --- /dev/null +++ b/arch/s390/include/asm/pkey.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernelspace interface to the pkey device driver + * + * Copyright IBM Corp. 2016, 2023 + * + * Author: Harald Freudenberger + * + */ + +#ifndef _KAPI_PKEY_H +#define _KAPI_PKEY_H + +#include +#include +#include + +/* + * In-kernel API: Transform an key blob (of any type) into a protected key. + * @param key pointer to a buffer containing the key blob + * @param keylen size of the key blob in bytes + * @param protkey pointer to buffer receiving the protected key + * @return 0 on success, negative errno value on failure + */ +int pkey_keyblob2pkey(const u8 *key, u32 keylen, + u8 *protkey, u32 *protkeylen, u32 *protkeytype); + +#endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/pnet.h b/arch/s390/include/asm/pnet.h new file mode 100644 index 0000000000..5739276b45 --- /dev/null +++ b/arch/s390/include/asm/pnet.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * IBM System z PNET ID Support + * + * Copyright IBM Corp. 2018 + */ + +#ifndef _ASM_S390_PNET_H +#define _ASM_S390_PNET_H + +#include +#include + +int pnet_id_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid); +#endif /* _ASM_S390_PNET_H */ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h new file mode 100644 index 0000000000..bf15da0fed --- /dev/null +++ b/arch/s390/include/asm/preempt.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include +#include +#include + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +/* We use the MSB mostly because its available */ +#define PREEMPT_NEED_RESCHED 0x80000000 +#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; +} + +static inline void preempt_count_set(int pc) +{ + int old, new; + + do { + old = READ_ONCE(S390_lowcore.preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (__atomic_cmpxchg(&S390_lowcore.preempt_count, + old, new) != old); +} + +static inline void set_preempt_need_resched(void) +{ + __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline void clear_preempt_need_resched(void) +{ + __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline bool test_preempt_need_resched(void) +{ + return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); +} + +static inline void __preempt_count_add(int val) +{ + /* + * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES + * enabled, gcc 12 fails to handle __builtin_constant_p(). + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) { + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) { + __atomic_add_const(val, &S390_lowcore.preempt_count); + return; + } + } + __atomic_add(val, &S390_lowcore.preempt_count); +} + +static inline void __preempt_count_sub(int val) +{ + __preempt_count_add(-val); +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(READ_ONCE(S390_lowcore.preempt_count) == + preempt_offset); +} + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define PREEMPT_ENABLED (0) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count); +} + +static inline void preempt_count_set(int pc) +{ + S390_lowcore.preempt_count = pc; +} + +static inline void set_preempt_need_resched(void) +{ +} + +static inline void clear_preempt_need_resched(void) +{ +} + +static inline bool test_preempt_need_resched(void) +{ + return false; +} + +static inline void __preempt_count_add(int val) +{ + S390_lowcore.preempt_count += val; +} + +static inline void __preempt_count_sub(int val) +{ + S390_lowcore.preempt_count -= val; +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return !--S390_lowcore.preempt_count && tif_need_resched(); +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(preempt_count() == preempt_offset && + tif_need_resched()); +} + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define init_task_preempt_count(p) do { } while (0) +/* Deferred to CPU bringup time */ +#define init_idle_preempt_count(p, cpu) do { } while (0) + +#ifdef CONFIG_PREEMPTION +extern void preempt_schedule(void); +#define __preempt_schedule() preempt_schedule() +extern void preempt_schedule_notrace(void); +#define __preempt_schedule_notrace() preempt_schedule_notrace() +#endif /* CONFIG_PREEMPTION */ + +#endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h new file mode 100644 index 0000000000..dc17896a00 --- /dev/null +++ b/arch/s390/include/asm/processor.h @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/processor.h" + * Copyright (C) 1994, Linus Torvalds + */ + +#ifndef __ASM_S390_PROCESSOR_H +#define __ASM_S390_PROCESSOR_H + +#include + +#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ +#define CIF_FPU 3 /* restore FPU registers */ +#define CIF_ENABLED_WAIT 5 /* in enabled wait state */ +#define CIF_MCCK_GUEST 6 /* machine check happening in guest */ +#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ + +#define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) +#define _CIF_FPU BIT(CIF_FPU) +#define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT) +#define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST) +#define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU) + +#define RESTART_FLAG_CTLREGS _AC(1 << 0, U) + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef long (*sys_call_ptr_t)(struct pt_regs *regs); + +static __always_inline void set_cpu_flag(int flag) +{ + S390_lowcore.cpu_flags |= (1UL << flag); +} + +static __always_inline void clear_cpu_flag(int flag) +{ + S390_lowcore.cpu_flags &= ~(1UL << flag); +} + +static __always_inline bool test_cpu_flag(int flag) +{ + return S390_lowcore.cpu_flags & (1UL << flag); +} + +static __always_inline bool test_and_set_cpu_flag(int flag) +{ + if (test_cpu_flag(flag)) + return true; + set_cpu_flag(flag); + return false; +} + +static __always_inline bool test_and_clear_cpu_flag(int flag) +{ + if (!test_cpu_flag(flag)) + return false; + clear_cpu_flag(flag); + return true; +} + +/* + * Test CIF flag of another CPU. The caller needs to ensure that + * CPU hotplug can not happen, e.g. by disabling preemption. + */ +static __always_inline bool test_cpu_flag_of(int flag, int cpu) +{ + struct lowcore *lc = lowcore_ptr[cpu]; + + return lc->cpu_flags & (1UL << flag); +} + +#define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) + +static inline void get_cpu_id(struct cpuid *ptr) +{ + asm volatile("stidp %0" : "=Q" (*ptr)); +} + +void s390_adjust_jiffies(void); +void s390_update_cpu_mhz(void); +void cpu_detect_mhz_feature(void); + +extern const struct seq_operations cpuinfo_op; +extern void execve_tail(void); +unsigned long vdso_size(void); + +/* + * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. + */ + +#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \ + _REGION3_SIZE : TASK_SIZE_MAX) +#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ + (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) +#define TASK_SIZE_MAX (-PAGE_SIZE) + +#define VDSO_BASE (STACK_TOP + PAGE_SIZE) +#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE) +#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE) + +#define HAVE_ARCH_PICK_MMAP_LAYOUT + +#define __stackleak_poison __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + unsigned long tmp, count; + + count = erase_high - erase_low; + if (!count) + return; + asm volatile( + " cghi %[count],8\n" + " je 2f\n" + " aghi %[count],-(8+1)\n" + " srlg %[tmp],%[count],8\n" + " ltgr %[tmp],%[tmp]\n" + " jz 1f\n" + "0: stg %[poison],0(%[addr])\n" + " mvc 8(256-8,%[addr]),0(%[addr])\n" + " la %[addr],256(%[addr])\n" + " brctg %[tmp],0b\n" + "1: stg %[poison],0(%[addr])\n" + " larl %[tmp],3f\n" + " ex %[count],0(%[tmp])\n" + " j 4f\n" + "2: stg %[poison],0(%[addr])\n" + " j 4f\n" + "3: mvc 8(1,%[addr]),0(%[addr])\n" + "4:\n" + : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp) + : [poison] "d" (poison) + : "memory", "cc" + ); +} + +/* + * Thread structure + */ +struct thread_struct { + unsigned int acrs[NUM_ACRS]; + unsigned long ksp; /* kernel stack pointer */ + unsigned long user_timer; /* task cputime in user space */ + unsigned long guest_timer; /* task cputime in kvm guest */ + unsigned long system_timer; /* task cputime in kernel space */ + unsigned long hardirq_timer; /* task cputime in hardirq context */ + unsigned long softirq_timer; /* task cputime in softirq context */ + const sys_call_ptr_t *sys_call_table; /* system call table address */ + unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ + unsigned int gmap_pfault; /* signal of a pending guest pfault */ + + /* Per-thread information related to debugging */ + struct per_regs per_user; /* User specified PER registers */ + struct per_event per_event; /* Cause of the last PER trap */ + unsigned long per_flags; /* Flags to control debug behavior */ + unsigned int system_call; /* system call number in signal */ + unsigned long last_break; /* last breaking-event-address. */ + /* pfault_wait is used to block the process on a pfault event */ + unsigned long pfault_wait; + struct list_head list; + /* cpu runtime instrumentation */ + struct runtime_instr_cb *ri_cb; + struct gs_cb *gs_cb; /* Current guarded storage cb */ + struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ + struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */ + /* + * Warning: 'fpu' is dynamically-sized. It *MUST* be at + * the end. + */ + struct fpu fpu; /* FP and VX register save area */ +}; + +/* Flag to disable transactions. */ +#define PER_FLAG_NO_TE 1UL +/* Flag to enable random transaction aborts. */ +#define PER_FLAG_TE_ABORT_RAND 2UL +/* Flag to specify random transaction abort mode: + * - abort each transaction at a random instruction before TEND if set. + * - abort random transactions at a random instruction if cleared. + */ +#define PER_FLAG_TE_ABORT_RAND_TEND 4UL + +typedef struct thread_struct thread_struct; + +#define ARCH_MIN_TASKALIGN 8 + +#define INIT_THREAD { \ + .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ + .fpu.regs = (void *) init_task.thread.fpu.fprs, \ + .last_break = 1, \ +} + +/* + * Do necessary setup to start up a new thread. + */ +#define start_thread(regs, new_psw, new_stackp) do { \ + regs->psw.mask = PSW_USER_BITS | PSW_MASK_EA | PSW_MASK_BA; \ + regs->psw.addr = new_psw; \ + regs->gprs[15] = new_stackp; \ + execve_tail(); \ +} while (0) + +#define start_thread31(regs, new_psw, new_stackp) do { \ + regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ + regs->psw.addr = new_psw; \ + regs->gprs[15] = new_stackp; \ + execve_tail(); \ +} while (0) + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; +struct seq_file; +struct pt_regs; + +void show_registers(struct pt_regs *regs); +void show_cacheinfo(struct seq_file *m); + +/* Free guarded storage control block */ +void guarded_storage_release(struct task_struct *tsk); +void gs_load_bc_cb(struct pt_regs *regs); + +unsigned long __get_wchan(struct task_struct *p); +#define task_pt_regs(tsk) ((struct pt_regs *) \ + (task_stack_page(tsk) + THREAD_SIZE) - 1) +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr) +#define KSTK_ESP(tsk) (task_pt_regs(tsk)->gprs[15]) + +/* Has task runtime instrumentation enabled ? */ +#define is_ri_task(tsk) (!!(tsk)->thread.ri_cb) + +/* avoid using global register due to gcc bug in versions < 8.4 */ +#define current_stack_pointer (__current_stack_pointer()) + +static __always_inline unsigned long __current_stack_pointer(void) +{ + unsigned long sp; + + asm volatile("lgr %0,15" : "=d" (sp)); + return sp; +} + +static __always_inline bool on_thread_stack(void) +{ + unsigned long ksp = S390_lowcore.kernel_stack; + + return !((ksp ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} + +static __always_inline unsigned short stap(void) +{ + unsigned short cpu_address; + + asm volatile("stap %0" : "=Q" (cpu_address)); + return cpu_address; +} + +#define cpu_relax() barrier() + +#define ECAG_CACHE_ATTRIBUTE 0 +#define ECAG_CPU_ATTRIBUTE 1 + +static inline unsigned long __ecag(unsigned int asi, unsigned char parm) +{ + unsigned long val; + + asm volatile("ecag %0,0,0(%1)" : "=d" (val) : "a" (asi << 8 | parm)); + return val; +} + +static inline void psw_set_key(unsigned int key) +{ + asm volatile("spka 0(%0)" : : "d" (key)); +} + +/* + * Set PSW to specified value. + */ +static inline void __load_psw(psw_t psw) +{ + asm volatile("lpswe %0" : : "Q" (psw) : "cc"); +} + +/* + * Set PSW mask to specified value, while leaving the + * PSW addr pointing to the next instruction. + */ +static __always_inline void __load_psw_mask(unsigned long mask) +{ + unsigned long addr; + psw_t psw; + + psw.mask = mask; + + asm volatile( + " larl %0,1f\n" + " stg %0,%1\n" + " lpswe %2\n" + "1:" + : "=&d" (addr), "=Q" (psw.addr) : "Q" (psw) : "memory", "cc"); +} + +/* + * Extract current PSW mask + */ +static inline unsigned long __extract_psw(void) +{ + unsigned int reg1, reg2; + + asm volatile("epsw %0,%1" : "=d" (reg1), "=a" (reg2)); + return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); +} + +static inline void local_mcck_enable(void) +{ + __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK); +} + +static inline void local_mcck_disable(void) +{ + __load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK); +} + +/* + * Rewind PSW instruction address by specified number of bytes. + */ +static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) +{ + unsigned long mask; + + mask = (psw.mask & PSW_MASK_EA) ? -1UL : + (psw.mask & PSW_MASK_BA) ? (1UL << 31) - 1 : + (1UL << 24) - 1; + return (psw.addr - ilc) & mask; +} + +/* + * Function to drop a processor into disabled wait state + */ +static __always_inline void __noreturn disabled_wait(void) +{ + psw_t psw; + + psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA; + psw.addr = _THIS_IP_; + __load_psw(psw); + while (1); +} + +#define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL + +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) +{ + return arch_irqs_disabled_flags(regs->psw.mask); +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_S390_PROCESSOR_H */ diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h new file mode 100644 index 0000000000..f960b28966 --- /dev/null +++ b/arch/s390/include/asm/ptdump.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_PTDUMP_H +#define _ASM_S390_PTDUMP_H + +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} + +#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h new file mode 100644 index 0000000000..d28bf8fb27 --- /dev/null +++ b/arch/s390/include/asm/ptrace.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + */ +#ifndef _S390_PTRACE_H +#define _S390_PTRACE_H + +#include +#include +#include + +#define PIF_SYSCALL 0 /* inside a system call */ +#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ +#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ +#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ + +#define _PIF_SYSCALL BIT(PIF_SYSCALL) +#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) +#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) +#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) + +#define PSW32_MASK_PER _AC(0x40000000, UL) +#define PSW32_MASK_DAT _AC(0x04000000, UL) +#define PSW32_MASK_IO _AC(0x02000000, UL) +#define PSW32_MASK_EXT _AC(0x01000000, UL) +#define PSW32_MASK_KEY _AC(0x00F00000, UL) +#define PSW32_MASK_BASE _AC(0x00080000, UL) /* Always one */ +#define PSW32_MASK_MCHECK _AC(0x00040000, UL) +#define PSW32_MASK_WAIT _AC(0x00020000, UL) +#define PSW32_MASK_PSTATE _AC(0x00010000, UL) +#define PSW32_MASK_ASC _AC(0x0000C000, UL) +#define PSW32_MASK_CC _AC(0x00003000, UL) +#define PSW32_MASK_PM _AC(0x00000f00, UL) +#define PSW32_MASK_RI _AC(0x00000080, UL) + +#define PSW32_ADDR_AMODE _AC(0x80000000, UL) +#define PSW32_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW32_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 20) + +#define PSW32_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW32_ASC_ACCREG _AC(0x00004000, UL) +#define PSW32_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW32_ASC_HOME _AC(0x0000C000, UL) + +#define PSW_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 52) + +#define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) +#define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ + PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ + PSW_MASK_PSTATE | PSW_ASC_PRIMARY) + +#ifndef __ASSEMBLY__ + +struct psw_bits { + unsigned long : 1; + unsigned long per : 1; /* PER-Mask */ + unsigned long : 3; + unsigned long dat : 1; /* DAT Mode */ + unsigned long io : 1; /* Input/Output Mask */ + unsigned long ext : 1; /* External Mask */ + unsigned long key : 4; /* PSW Key */ + unsigned long : 1; + unsigned long mcheck : 1; /* Machine-Check Mask */ + unsigned long wait : 1; /* Wait State */ + unsigned long pstate : 1; /* Problem State */ + unsigned long as : 2; /* Address Space Control */ + unsigned long cc : 2; /* Condition Code */ + unsigned long pm : 4; /* Program Mask */ + unsigned long ri : 1; /* Runtime Instrumentation */ + unsigned long : 6; + unsigned long eaba : 2; /* Addressing Mode */ + unsigned long : 31; + unsigned long ia : 64; /* Instruction Address */ +}; + +enum { + PSW_BITS_AMODE_24BIT = 0, + PSW_BITS_AMODE_31BIT = 1, + PSW_BITS_AMODE_64BIT = 3 +}; + +enum { + PSW_BITS_AS_PRIMARY = 0, + PSW_BITS_AS_ACCREG = 1, + PSW_BITS_AS_SECONDARY = 2, + PSW_BITS_AS_HOME = 3 +}; + +#define psw_bits(__psw) (*({ \ + typecheck(psw_t, __psw); \ + &(*(struct psw_bits *)(&(__psw))); \ +})) + +typedef struct { + unsigned int mask; + unsigned int addr; +} psw_t32 __aligned(8); + +#define PGM_INT_CODE_MASK 0x7f +#define PGM_INT_CODE_PER 0x80 + +/* + * The pt_regs struct defines the way the registers are stored on + * the stack during a system call. + */ +struct pt_regs { + union { + user_pt_regs user_regs; + struct { + unsigned long args[1]; + psw_t psw; + unsigned long gprs[NUM_GPRS]; + }; + }; + unsigned long orig_gpr2; + union { + struct { + unsigned int int_code; + unsigned int int_parm; + unsigned long int_parm_long; + }; + struct tpi_info tpi_info; + }; + unsigned long flags; + unsigned long cr1; + unsigned long last_break; +}; + +/* + * Program event recording (PER) register set. + */ +struct per_regs { + unsigned long control; /* PER control bits */ + unsigned long start; /* PER starting address */ + unsigned long end; /* PER ending address */ +}; + +/* + * PER event contains information about the cause of the last PER exception. + */ +struct per_event { + unsigned short cause; /* PER code, ATMID and AI */ + unsigned long address; /* PER address */ + unsigned char paid; /* PER access identification */ +}; + +/* + * Simplified per_info structure used to decode the ptrace user space ABI. + */ +struct per_struct_kernel { + unsigned long cr9; /* PER control bits */ + unsigned long cr10; /* PER starting address */ + unsigned long cr11; /* PER ending address */ + unsigned long bits; /* Obsolete software bits */ + unsigned long starting_addr; /* User specified start address */ + unsigned long ending_addr; /* User specified end address */ + unsigned short perc_atmid; /* PER trap ATMID */ + unsigned long address; /* PER trap instruction address */ + unsigned char access_id; /* PER trap access identification */ +}; + +#define PER_EVENT_MASK 0xEB000000UL + +#define PER_EVENT_BRANCH 0x80000000UL +#define PER_EVENT_IFETCH 0x40000000UL +#define PER_EVENT_STORE 0x20000000UL +#define PER_EVENT_STORE_REAL 0x08000000UL +#define PER_EVENT_TRANSACTION_END 0x02000000UL +#define PER_EVENT_NULLIFICATION 0x01000000UL + +#define PER_CONTROL_MASK 0x00e00000UL + +#define PER_CONTROL_BRANCH_ADDRESS 0x00800000UL +#define PER_CONTROL_SUSPENSION 0x00400000UL +#define PER_CONTROL_ALTERATION 0x00200000UL + +static inline void set_pt_regs_flag(struct pt_regs *regs, int flag) +{ + regs->flags |= (1UL << flag); +} + +static inline void clear_pt_regs_flag(struct pt_regs *regs, int flag) +{ + regs->flags &= ~(1UL << flag); +} + +static inline int test_pt_regs_flag(struct pt_regs *regs, int flag) +{ + return !!(regs->flags & (1UL << flag)); +} + +static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag) +{ + int ret = test_pt_regs_flag(regs, flag); + + clear_pt_regs_flag(regs, flag); + return ret; +} + +/* + * These are defined as per linux/ptrace.h, which see. + */ +#define arch_has_single_step() (1) +#define arch_has_block_step() (1) + +#define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) +#define instruction_pointer(regs) ((regs)->psw.addr) +#define user_stack_pointer(regs)((regs)->gprs[15]) +#define profile_pc(regs) instruction_pointer(regs) + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + +static inline void instruction_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + regs->psw.addr = val; +} + +int regs_query_register_offset(const char *name); +const char *regs_query_register_name(unsigned int offset); +unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); + +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns @n th argument of the function call. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ + unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); + +#define NR_REG_ARGUMENTS 5 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, 2 + n); + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, argoffset + n); +} + +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->gprs[15]; +} + +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gprs[2] = rc; +} + +#endif /* __ASSEMBLY__ */ +#endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/purgatory.h b/arch/s390/include/asm/purgatory.h new file mode 100644 index 0000000000..e297bcfc47 --- /dev/null +++ b/arch/s390/include/asm/purgatory.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#ifndef _S390_PURGATORY_H_ +#define _S390_PURGATORY_H_ +#ifndef __ASSEMBLY__ + +#include + +int verify_sha256_digest(void); + +#endif /* __ASSEMBLY__ */ +#endif /* _S390_PURGATORY_H_ */ diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h new file mode 100644 index 0000000000..2f983e0b95 --- /dev/null +++ b/arch/s390/include/asm/qdio.h @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2000, 2008 + * Author(s): Utz Bacher + * Jan Glauber + * + */ +#ifndef __QDIO_H__ +#define __QDIO_H__ + +#include +#include +#include + +/* only use 4 queues to save some cachelines */ +#define QDIO_MAX_QUEUES_PER_IRQ 4 +#define QDIO_MAX_BUFFERS_PER_Q 128 +#define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1) +#define QDIO_BUFNR(num) ((num) & QDIO_MAX_BUFFERS_MASK) +#define QDIO_MAX_ELEMENTS_PER_BUFFER 16 + +#define QDIO_QETH_QFMT 0 +#define QDIO_ZFCP_QFMT 1 +#define QDIO_IQDIO_QFMT 2 + +/** + * struct qdesfmt0 - queue descriptor, format 0 + * @sliba: absolute address of storage list information block + * @sla: absolute address of storage list + * @slsba: absolute address of storage list state block + * @akey: access key for SLIB + * @bkey: access key for SL + * @ckey: access key for SBALs + * @dkey: access key for SLSB + */ +struct qdesfmt0 { + u64 sliba; + u64 sla; + u64 slsba; + u32 : 32; + u32 akey : 4; + u32 bkey : 4; + u32 ckey : 4; + u32 dkey : 4; + u32 : 16; +} __attribute__ ((packed)); + +#define QDR_AC_MULTI_BUFFER_ENABLE 0x01 + +/** + * struct qdr - queue description record (QDR) + * @qfmt: queue format + * @ac: adapter characteristics + * @iqdcnt: input queue descriptor count + * @oqdcnt: output queue descriptor count + * @iqdsz: input queue descriptor size + * @oqdsz: output queue descriptor size + * @qiba: absolute address of queue information block + * @qkey: queue information block key + * @qdf0: queue descriptions + */ +struct qdr { + u32 qfmt : 8; + u32 : 16; + u32 ac : 8; + u32 : 8; + u32 iqdcnt : 8; + u32 : 8; + u32 oqdcnt : 8; + u32 : 8; + u32 iqdsz : 8; + u32 : 8; + u32 oqdsz : 8; + /* private: */ + u32 res[9]; + /* public: */ + u64 qiba; + u32 : 32; + u32 qkey : 4; + u32 : 28; + struct qdesfmt0 qdf0[126]; +} __packed __aligned(PAGE_SIZE); + +#define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 +#define QIB_RFLAGS_ENABLE_QEBSM 0x80 +#define QIB_RFLAGS_ENABLE_DATA_DIV 0x02 + +/** + * struct qib - queue information block (QIB) + * @qfmt: queue format + * @pfmt: implementation dependent parameter format + * @rflags: QEBSM + * @ac: adapter characteristics + * @isliba: logical address of first input SLIB + * @osliba: logical address of first output SLIB + * @ebcnam: adapter identifier in EBCDIC + * @parm: implementation dependent parameters + */ +struct qib { + u32 qfmt : 8; + u32 pfmt : 8; + u32 rflags : 8; + u32 ac : 8; + u32 : 32; + u64 isliba; + u64 osliba; + u32 : 32; + u32 : 32; + u8 ebcnam[8]; + /* private: */ + u8 res[88]; + /* public: */ + u8 parm[128]; +} __attribute__ ((packed, aligned(256))); + +/** + * struct slibe - storage list information block element (SLIBE) + * @parms: implementation dependent parameters + */ +struct slibe { + u64 parms; +}; + +/** + * struct qaob - queue asynchronous operation block + * @res0: reserved parameters + * @res1: reserved parameter + * @res2: reserved parameter + * @res3: reserved parameter + * @aorc: asynchronous operation return code + * @flags: internal flags + * @cbtbs: control block type + * @sb_count: number of storage blocks + * @sba: storage block element addresses + * @dcount: size of storage block elements + * @user0: user definable value + * @res4: reserved parameter + * @user1: user definable value + */ +struct qaob { + u64 res0[6]; + u8 res1; + u8 res2; + u8 res3; + u8 aorc; + u8 flags; + u16 cbtbs; + u8 sb_count; + u64 sba[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u64 user0; + u64 res4[2]; + u8 user1[16]; +} __attribute__ ((packed, aligned(256))); + +/** + * struct slib - storage list information block (SLIB) + * @nsliba: next SLIB address (if any) + * @sla: SL address + * @slsba: SLSB address + * @slibe: SLIB elements + */ +struct slib { + u64 nsliba; + u64 sla; + u64 slsba; + /* private: */ + u8 res[1000]; + /* public: */ + struct slibe slibe[QDIO_MAX_BUFFERS_PER_Q]; +} __attribute__ ((packed, aligned(2048))); + +#define SBAL_EFLAGS_LAST_ENTRY 0x40 +#define SBAL_EFLAGS_CONTIGUOUS 0x20 +#define SBAL_EFLAGS_FIRST_FRAG 0x04 +#define SBAL_EFLAGS_MIDDLE_FRAG 0x08 +#define SBAL_EFLAGS_LAST_FRAG 0x0c +#define SBAL_EFLAGS_MASK 0x6f + +#define SBAL_SFLAGS0_PCI_REQ 0x40 +#define SBAL_SFLAGS0_DATA_CONTINUATION 0x20 + +/* Awesome OpenFCP extensions */ +#define SBAL_SFLAGS0_TYPE_STATUS 0x00 +#define SBAL_SFLAGS0_TYPE_WRITE 0x08 +#define SBAL_SFLAGS0_TYPE_READ 0x10 +#define SBAL_SFLAGS0_TYPE_WRITE_READ 0x18 +#define SBAL_SFLAGS0_MORE_SBALS 0x04 +#define SBAL_SFLAGS0_COMMAND 0x02 +#define SBAL_SFLAGS0_LAST_SBAL 0x00 +#define SBAL_SFLAGS0_ONLY_SBAL SBAL_SFLAGS0_COMMAND +#define SBAL_SFLAGS0_MIDDLE_SBAL SBAL_SFLAGS0_MORE_SBALS +#define SBAL_SFLAGS0_FIRST_SBAL (SBAL_SFLAGS0_MORE_SBALS | SBAL_SFLAGS0_COMMAND) + +/** + * struct qdio_buffer_element - SBAL entry + * @eflags: SBAL entry flags + * @scount: SBAL count + * @sflags: whole SBAL flags + * @length: length + * @addr: absolute data address +*/ +struct qdio_buffer_element { + u8 eflags; + /* private: */ + u8 res1; + /* public: */ + u8 scount; + u8 sflags; + u32 length; + u64 addr; +} __attribute__ ((packed, aligned(16))); + +/** + * struct qdio_buffer - storage block address list (SBAL) + * @element: SBAL entries + */ +struct qdio_buffer { + struct qdio_buffer_element element[QDIO_MAX_ELEMENTS_PER_BUFFER]; +} __attribute__ ((packed, aligned(256))); + +/** + * struct sl_element - storage list entry + * @sbal: absolute SBAL address + */ +struct sl_element { + u64 sbal; +} __attribute__ ((packed)); + +/** + * struct sl - storage list (SL) + * @element: SL entries + */ +struct sl { + struct sl_element element[QDIO_MAX_BUFFERS_PER_Q]; +} __attribute__ ((packed, aligned(1024))); + +/** + * struct slsb - storage list state block (SLSB) + * @val: state per buffer + */ +struct slsb { + u8 val[QDIO_MAX_BUFFERS_PER_Q]; +} __attribute__ ((packed, aligned(256))); + +/* qdio adapter-characteristics-1 flag */ +#define CHSC_AC1_INITIATE_INPUTQ 0x80 +#define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */ +#define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */ +#define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */ +#define AC1_AUTOMATIC_SYNC_ON_THININT 0x08 /* set by hypervisor */ +#define AC1_AUTOMATIC_SYNC_ON_OUT_PCI 0x04 /* set by hypervisor */ +#define AC1_SC_QEBSM_AVAILABLE 0x02 /* available for subchannel */ +#define AC1_SC_QEBSM_ENABLED 0x01 /* enabled for subchannel */ + +#define CHSC_AC2_MULTI_BUFFER_AVAILABLE 0x0080 +#define CHSC_AC2_MULTI_BUFFER_ENABLED 0x0040 +#define CHSC_AC2_DATA_DIV_AVAILABLE 0x0010 +#define CHSC_AC2_SNIFFER_AVAILABLE 0x0008 +#define CHSC_AC2_DATA_DIV_ENABLED 0x0002 + +#define CHSC_AC3_FORMAT2_CQ_AVAILABLE 0x8000 + +struct qdio_ssqd_desc { + u8 flags; + u8:8; + u16 sch; + u8 qfmt; + u8 parm; + u8 qdioac1; + u8 sch_class; + u8 pcnt; + u8 icnt; + u8:8; + u8 ocnt; + u8:8; + u8 mbccnt; + u16 qdioac2; + u64 sch_token; + u8 mro; + u8 mri; + u16 qdioac3; + u16:16; + u8:8; + u8 mmwc; +} __attribute__ ((packed)); + +/* params are: ccw_device, qdio_error, queue_number, + first element processed, number of elements processed, int_parm */ +typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, + int, int, unsigned long); + +/* qdio errors reported through the queue handlers: */ +#define QDIO_ERROR_ACTIVATE 0x0001 +#define QDIO_ERROR_GET_BUF_STATE 0x0002 +#define QDIO_ERROR_SET_BUF_STATE 0x0004 + +/* extra info for completed SBALs: */ +#define QDIO_ERROR_SLSB_STATE 0x0100 +#define QDIO_ERROR_SLSB_PENDING 0x0200 + +/* for qdio_cleanup */ +#define QDIO_FLAG_CLEANUP_USING_CLEAR 0x01 +#define QDIO_FLAG_CLEANUP_USING_HALT 0x02 + +/** + * struct qdio_initialize - qdio initialization data + * @q_format: queue format + * @qdr_ac: feature flags to set + * @qib_param_field_format: format for qib_parm_field + * @qib_param_field: pointer to 128 bytes or NULL, if no param field + * @qib_rflags: rflags to set + * @no_input_qs: number of input queues + * @no_output_qs: number of output queues + * @input_handler: handler to be called for input queues, and device-wide errors + * @output_handler: handler to be called for output queues + * @irq_poll: Data IRQ polling handler + * @scan_threshold: # of in-use buffers that triggers scan on output queue + * @int_parm: interruption parameter + * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs + * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs + */ +struct qdio_initialize { + unsigned char q_format; + unsigned char qdr_ac; + unsigned int qib_param_field_format; + unsigned char *qib_param_field; + unsigned char qib_rflags; + unsigned int no_input_qs; + unsigned int no_output_qs; + qdio_handler_t *input_handler; + qdio_handler_t *output_handler; + void (*irq_poll)(struct ccw_device *cdev, unsigned long data); + unsigned long int_parm; + struct qdio_buffer ***input_sbal_addr_array; + struct qdio_buffer ***output_sbal_addr_array; +}; + +int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count); +void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count); +void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count); + +extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, + unsigned int no_output_qs); +extern int qdio_establish(struct ccw_device *cdev, + struct qdio_initialize *init_data); +extern int qdio_activate(struct ccw_device *); +extern int qdio_start_irq(struct ccw_device *cdev); +extern int qdio_stop_irq(struct ccw_device *cdev); +extern int qdio_inspect_input_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_inspect_output_queue(struct ccw_device *cdev, unsigned int nr, + unsigned int *bufnr, unsigned int *error); +extern int qdio_add_bufs_to_input_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count); +extern int qdio_add_bufs_to_output_queue(struct ccw_device *cdev, + unsigned int q_nr, unsigned int bufnr, + unsigned int count, struct qaob *aob); +extern int qdio_shutdown(struct ccw_device *, int); +extern int qdio_free(struct ccw_device *); +extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *); + +#endif /* __QDIO_H__ */ diff --git a/arch/s390/include/asm/runtime_instr.h b/arch/s390/include/asm/runtime_instr.h new file mode 100644 index 0000000000..0e1605538c --- /dev/null +++ b/arch/s390/include/asm/runtime_instr.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RUNTIME_INSTR_H +#define _RUNTIME_INSTR_H + +#include + +extern struct runtime_instr_cb runtime_instr_empty_cb; + +static inline void save_ri_cb(struct runtime_instr_cb *cb_prev) +{ + if (cb_prev) + store_runtime_instr_cb(cb_prev); +} + +static inline void restore_ri_cb(struct runtime_instr_cb *cb_next, + struct runtime_instr_cb *cb_prev) +{ + if (cb_next) + load_runtime_instr_cb(cb_next); + else if (cb_prev) + load_runtime_instr_cb(&runtime_instr_empty_cb); +} + +struct task_struct; + +void runtime_instr_release(struct task_struct *tsk); + +#endif /* _RUNTIME_INSTR_H */ diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h new file mode 100644 index 0000000000..91fc24520e --- /dev/null +++ b/arch/s390/include/asm/rwonce.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_S390_RWONCE_H +#define __ASM_S390_RWONCE_H + +#include + +/* + * Use READ_ONCE_ALIGNED_128() for 128-bit block concurrent (atomic) read + * accesses. Note that x must be 128-bit aligned, otherwise a specification + * exception is generated. + */ +#define READ_ONCE_ALIGNED_128(x) \ +({ \ + union { \ + typeof(x) __x; \ + __uint128_t val; \ + } __u; \ + \ + BUILD_BUG_ON(sizeof(x) != 16); \ + asm volatile( \ + " lpq %[val],%[_x]\n" \ + : [val] "=d" (__u.val) \ + : [_x] "QS" (x) \ + : "memory"); \ + __u.__x; \ +}) + +#include + +#endif /* __ASM_S390_RWONCE_H */ diff --git a/arch/s390/include/asm/schid.h b/arch/s390/include/asm/schid.h new file mode 100644 index 0000000000..3ac405a67f --- /dev/null +++ b/arch/s390/include/asm/schid.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_SCHID_H +#define ASM_SCHID_H + +#include +#include + +/* Helper function for sane state of pre-allocated subchannel_id. */ +static inline void +init_subchannel_id(struct subchannel_id *schid) +{ + memset(schid, 0, sizeof(struct subchannel_id)); + schid->one = 1; +} + +static inline int +schid_equal(struct subchannel_id *schid1, struct subchannel_id *schid2) +{ + return !memcmp(schid1, schid2, sizeof(struct subchannel_id)); +} + +#endif /* ASM_SCHID_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h new file mode 100644 index 0000000000..5742d23bba --- /dev/null +++ b/arch/s390/include/asm/sclp.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2007 + */ + +#ifndef _ASM_S390_SCLP_H +#define _ASM_S390_SCLP_H + +#include + +#define SCLP_CHP_INFO_MASK_SIZE 32 +#define EARLY_SCCB_SIZE PAGE_SIZE +#define SCLP_MAX_CORES 512 +/* 144 + 16 * SCLP_MAX_CORES + 2 * (SCLP_MAX_CORES - 1) */ +#define EXT_SCCB_READ_SCP (3 * PAGE_SIZE) +/* 24 + 16 * SCLP_MAX_CORES */ +#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE) + +#ifndef __ASSEMBLY__ +#include +#include +#include + +struct sclp_chp_info { + u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; + u8 standby[SCLP_CHP_INFO_MASK_SIZE]; + u8 configured[SCLP_CHP_INFO_MASK_SIZE]; +}; + +#define LOADPARM_LEN 8 + +struct sclp_ipl_info { + int is_valid; + int has_dump; + char loadparm[LOADPARM_LEN]; +}; + +struct sclp_core_entry { + u8 core_id; + u8 reserved0; + u8 : 4; + u8 sief2 : 1; + u8 skey : 1; + u8 : 2; + u8 : 2; + u8 gpere : 1; + u8 siif : 1; + u8 sigpif : 1; + u8 : 3; + u8 reserved2[3]; + u8 : 2; + u8 ib : 1; + u8 cei : 1; + u8 : 4; + u8 reserved3[6]; + u8 type; + u8 reserved1; +} __attribute__((packed)); + +struct sclp_core_info { + unsigned int configured; + unsigned int standby; + unsigned int combined; + struct sclp_core_entry core[SCLP_MAX_CORES]; +}; + +struct sclp_info { + unsigned char has_linemode : 1; + unsigned char has_vt220 : 1; + unsigned char has_siif : 1; + unsigned char has_sigpif : 1; + unsigned char has_core_type : 1; + unsigned char has_sprp : 1; + unsigned char has_hvs : 1; + unsigned char has_esca : 1; + unsigned char has_sief2 : 1; + unsigned char has_64bscao : 1; + unsigned char has_gpere : 1; + unsigned char has_cmma : 1; + unsigned char has_gsls : 1; + unsigned char has_ib : 1; + unsigned char has_cei : 1; + unsigned char has_pfmfi : 1; + unsigned char has_ibs : 1; + unsigned char has_skey : 1; + unsigned char has_kss : 1; + unsigned char has_gisaf : 1; + unsigned char has_diag318 : 1; + unsigned char has_diag320 : 1; + unsigned char has_sipl : 1; + unsigned char has_sipl_eckd : 1; + unsigned char has_dirq : 1; + unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; + unsigned int ibc; + unsigned int mtid; + unsigned int mtid_cp; + unsigned int mtid_prev; + unsigned long rzm; + unsigned long rnmax; + unsigned long hamax; + unsigned int max_cores; + unsigned long hsa_size; + unsigned long facilities; + unsigned int hmfai; +}; +extern struct sclp_info sclp; + +struct zpci_report_error_header { + u8 version; /* Interface version byte */ + u8 action; /* Action qualifier byte + * 0: Adapter Reset Request + * 1: Deconfigure and repair action requested + * (OpenCrypto Problem Call Home) + * 2: Informational Report + * (OpenCrypto Successful Diagnostics Execution) + */ + u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */ + u8 data[]; /* Subsequent Data passed verbatim to SCLP ET 24 */ +} __packed; + +extern char *sclp_early_sccb; + +void sclp_early_adjust_va(void); +void sclp_early_set_buffer(void *sccb); +int sclp_early_read_info(void); +int sclp_early_read_storage_info(void); +int sclp_early_get_core_info(struct sclp_core_info *info); +void sclp_early_get_ipl_info(struct sclp_ipl_info *info); +void sclp_early_detect(void); +void sclp_early_printk(const char *s); +void __sclp_early_printk(const char *s, unsigned int len); +void sclp_emergency_printk(const char *s); + +int sclp_early_get_memsize(unsigned long *mem); +int sclp_early_get_hsa_size(unsigned long *hsa_size); +int _sclp_get_core_info(struct sclp_core_info *info); +int sclp_core_configure(u8 core); +int sclp_core_deconfigure(u8 core); +int sclp_sdias_blk_count(void); +int sclp_sdias_copy(void *dest, int blk_num, int nr_blks); +int sclp_chp_configure(struct chp_id chpid); +int sclp_chp_deconfigure(struct chp_id chpid); +int sclp_chp_read_info(struct sclp_chp_info *info); +int sclp_pci_configure(u32 fid); +int sclp_pci_deconfigure(u32 fid); +int sclp_ap_configure(u32 apid); +int sclp_ap_deconfigure(u32 apid); +int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); +size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count); +void sclp_ocf_cpc_name_copy(char *dst); + +static inline int sclp_get_core_info(struct sclp_core_info *info, int early) +{ + if (early) + return sclp_early_get_core_info(info); + return _sclp_get_core_info(info); +} + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h new file mode 100644 index 0000000000..322bdcd4b6 --- /dev/null +++ b/arch/s390/include/asm/scsw.h @@ -0,0 +1,1050 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Helper functions for scsw access. + * + * Copyright IBM Corp. 2008, 2012 + * Author(s): Peter Oberparleiter + */ + +#ifndef _ASM_S390_SCSW_H_ +#define _ASM_S390_SCSW_H_ + +#include +#include +#include + +/** + * struct cmd_scsw - command-mode subchannel status word + * @key: subchannel key + * @sctl: suspend control + * @eswf: esw format + * @cc: deferred condition code + * @fmt: format + * @pfch: prefetch + * @isic: initial-status interruption control + * @alcc: address-limit checking control + * @ssi: suppress-suspended interruption + * @zcc: zero condition code + * @ectl: extended control + * @pno: path not operational + * @res: reserved + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @cpa: channel program address + * @dstat: device status + * @cstat: subchannel status + * @count: residual count + */ +struct cmd_scsw { + __u32 key : 4; + __u32 sctl : 1; + __u32 eswf : 1; + __u32 cc : 2; + __u32 fmt : 1; + __u32 pfch : 1; + __u32 isic : 1; + __u32 alcc : 1; + __u32 ssi : 1; + __u32 zcc : 1; + __u32 ectl : 1; + __u32 pno : 1; + __u32 res : 1; + __u32 fctl : 3; + __u32 actl : 7; + __u32 stctl : 5; + __u32 cpa; + __u32 dstat : 8; + __u32 cstat : 8; + __u32 count : 16; +} __attribute__ ((packed)); + +/** + * struct tm_scsw - transport-mode subchannel status word + * @key: subchannel key + * @eswf: esw format + * @cc: deferred condition code + * @fmt: format + * @x: IRB-format control + * @q: interrogate-complete + * @ectl: extended control + * @pno: path not operational + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @tcw: TCW address + * @dstat: device status + * @cstat: subchannel status + * @fcxs: FCX status + * @schxs: subchannel-extended status + */ +struct tm_scsw { + u32 key:4; + u32 :1; + u32 eswf:1; + u32 cc:2; + u32 fmt:3; + u32 x:1; + u32 q:1; + u32 :1; + u32 ectl:1; + u32 pno:1; + u32 :1; + u32 fctl:3; + u32 actl:7; + u32 stctl:5; + u32 tcw; + u32 dstat:8; + u32 cstat:8; + u32 fcxs:8; + u32 ifob:1; + u32 sesq:7; +} __attribute__ ((packed)); + +/** + * struct eadm_scsw - subchannel status word for eadm subchannels + * @key: subchannel key + * @eswf: esw format + * @cc: deferred condition code + * @ectl: extended control + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @aob: AOB address + * @dstat: device status + * @cstat: subchannel status + */ +struct eadm_scsw { + u32 key:4; + u32:1; + u32 eswf:1; + u32 cc:2; + u32:6; + u32 ectl:1; + u32:2; + u32 fctl:3; + u32 actl:7; + u32 stctl:5; + u32 aob; + u32 dstat:8; + u32 cstat:8; + u32:16; +} __packed; + +/** + * union scsw - subchannel status word + * @cmd: command-mode SCSW + * @tm: transport-mode SCSW + * @eadm: eadm SCSW + */ +union scsw { + struct cmd_scsw cmd; + struct tm_scsw tm; + struct eadm_scsw eadm; +} __packed; + +#define SCSW_FCTL_CLEAR_FUNC 0x1 +#define SCSW_FCTL_HALT_FUNC 0x2 +#define SCSW_FCTL_START_FUNC 0x4 + +#define SCSW_ACTL_SUSPENDED 0x1 +#define SCSW_ACTL_DEVACT 0x2 +#define SCSW_ACTL_SCHACT 0x4 +#define SCSW_ACTL_CLEAR_PEND 0x8 +#define SCSW_ACTL_HALT_PEND 0x10 +#define SCSW_ACTL_START_PEND 0x20 +#define SCSW_ACTL_RESUME_PEND 0x40 + +#define SCSW_STCTL_STATUS_PEND 0x1 +#define SCSW_STCTL_SEC_STATUS 0x2 +#define SCSW_STCTL_PRIM_STATUS 0x4 +#define SCSW_STCTL_INTER_STATUS 0x8 +#define SCSW_STCTL_ALERT_STATUS 0x10 + +#define DEV_STAT_ATTENTION 0x80 +#define DEV_STAT_STAT_MOD 0x40 +#define DEV_STAT_CU_END 0x20 +#define DEV_STAT_BUSY 0x10 +#define DEV_STAT_CHN_END 0x08 +#define DEV_STAT_DEV_END 0x04 +#define DEV_STAT_UNIT_CHECK 0x02 +#define DEV_STAT_UNIT_EXCEP 0x01 + +#define SCHN_STAT_PCI 0x80 +#define SCHN_STAT_INCORR_LEN 0x40 +#define SCHN_STAT_PROG_CHECK 0x20 +#define SCHN_STAT_PROT_CHECK 0x10 +#define SCHN_STAT_CHN_DATA_CHK 0x08 +#define SCHN_STAT_CHN_CTRL_CHK 0x04 +#define SCHN_STAT_INTF_CTRL_CHK 0x02 +#define SCHN_STAT_CHAIN_CHECK 0x01 + +#define SCSW_SESQ_DEV_NOFCX 3 +#define SCSW_SESQ_PATH_NOFCX 4 + +/* + * architectured values for first sense byte + */ +#define SNS0_CMD_REJECT 0x80 +#define SNS_CMD_REJECT SNS0_CMD_REJEC +#define SNS0_INTERVENTION_REQ 0x40 +#define SNS0_BUS_OUT_CHECK 0x20 +#define SNS0_EQUIPMENT_CHECK 0x10 +#define SNS0_DATA_CHECK 0x08 +#define SNS0_OVERRUN 0x04 +#define SNS0_INCOMPL_DOMAIN 0x01 + +/* + * architectured values for second sense byte + */ +#define SNS1_PERM_ERR 0x80 +#define SNS1_INV_TRACK_FORMAT 0x40 +#define SNS1_EOC 0x20 +#define SNS1_MESSAGE_TO_OPER 0x10 +#define SNS1_NO_REC_FOUND 0x08 +#define SNS1_FILE_PROTECTED 0x04 +#define SNS1_WRITE_INHIBITED 0x02 +#define SNS1_INPRECISE_END 0x01 + +/* + * architectured values for third sense byte + */ +#define SNS2_REQ_INH_WRITE 0x80 +#define SNS2_CORRECTABLE 0x40 +#define SNS2_FIRST_LOG_ERR 0x20 +#define SNS2_ENV_DATA_PRESENT 0x10 +#define SNS2_INPRECISE_END 0x04 + +/* + * architectured values for PPRC errors + */ +#define SNS7_INVALID_ON_SEC 0x0e + +/** + * scsw_is_tm - check for transport mode scsw + * @scsw: pointer to scsw + * + * Return non-zero if the specified scsw is a transport mode scsw, zero + * otherwise. + */ +static inline int scsw_is_tm(union scsw *scsw) +{ + return css_general_characteristics.fcx && (scsw->tm.x == 1); +} + +/** + * scsw_key - return scsw key field + * @scsw: pointer to scsw + * + * Return the value of the key field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_key(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.key; + else + return scsw->cmd.key; +} + +/** + * scsw_eswf - return scsw eswf field + * @scsw: pointer to scsw + * + * Return the value of the eswf field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_eswf(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.eswf; + else + return scsw->cmd.eswf; +} + +/** + * scsw_cc - return scsw cc field + * @scsw: pointer to scsw + * + * Return the value of the cc field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_cc(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.cc; + else + return scsw->cmd.cc; +} + +/** + * scsw_ectl - return scsw ectl field + * @scsw: pointer to scsw + * + * Return the value of the ectl field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_ectl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.ectl; + else + return scsw->cmd.ectl; +} + +/** + * scsw_pno - return scsw pno field + * @scsw: pointer to scsw + * + * Return the value of the pno field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_pno(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.pno; + else + return scsw->cmd.pno; +} + +/** + * scsw_fctl - return scsw fctl field + * @scsw: pointer to scsw + * + * Return the value of the fctl field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_fctl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.fctl; + else + return scsw->cmd.fctl; +} + +/** + * scsw_actl - return scsw actl field + * @scsw: pointer to scsw + * + * Return the value of the actl field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_actl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.actl; + else + return scsw->cmd.actl; +} + +/** + * scsw_stctl - return scsw stctl field + * @scsw: pointer to scsw + * + * Return the value of the stctl field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_stctl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.stctl; + else + return scsw->cmd.stctl; +} + +/** + * scsw_dstat - return scsw dstat field + * @scsw: pointer to scsw + * + * Return the value of the dstat field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_dstat(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.dstat; + else + return scsw->cmd.dstat; +} + +/** + * scsw_cstat - return scsw cstat field + * @scsw: pointer to scsw + * + * Return the value of the cstat field of the specified scsw, regardless of + * whether it is a transport mode or command mode scsw. + */ +static inline u32 scsw_cstat(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw->tm.cstat; + else + return scsw->cmd.cstat; +} + +/** + * scsw_cmd_is_valid_key - check key field validity + * @scsw: pointer to scsw + * + * Return non-zero if the key field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_key(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_sctl - check sctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the sctl field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_sctl(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_eswf - check eswf field validity + * @scsw: pointer to scsw + * + * Return non-zero if the eswf field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_eswf(union scsw *scsw) +{ + return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND); +} + +/** + * scsw_cmd_is_valid_cc - check cc field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cc field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_cc(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) && + (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND); +} + +/** + * scsw_cmd_is_valid_fmt - check fmt field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fmt field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_fmt(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_pfch - check pfch field validity + * @scsw: pointer to scsw + * + * Return non-zero if the pfch field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_pfch(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_isic - check isic field validity + * @scsw: pointer to scsw + * + * Return non-zero if the isic field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_isic(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_alcc - check alcc field validity + * @scsw: pointer to scsw + * + * Return non-zero if the alcc field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_alcc(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_ssi - check ssi field validity + * @scsw: pointer to scsw + * + * Return non-zero if the ssi field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_ssi(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_cmd_is_valid_zcc - check zcc field validity + * @scsw: pointer to scsw + * + * Return non-zero if the zcc field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_zcc(union scsw *scsw) +{ + return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) && + (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS); +} + +/** + * scsw_cmd_is_valid_ectl - check ectl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the ectl field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) +{ + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; +} + +/** + * scsw_cmd_is_valid_pno - check pno field validity + * @scsw: pointer to scsw + * + * Return non-zero if the pno field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_pno(union scsw *scsw) +{ + /* Must indicate at least one I/O function. */ + if (!scsw->cmd.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; +} + +/** + * scsw_cmd_is_valid_fctl - check fctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fctl field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_fctl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_cmd_is_valid_actl - check actl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the actl field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_actl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_cmd_is_valid_stctl - check stctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the stctl field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_stctl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_cmd_is_valid_dstat - check dstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the dstat field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_dstat(union scsw *scsw) +{ + return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && + (scsw->cmd.cc != 3); +} + +/** + * scsw_cmd_is_valid_cstat - check cstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cstat field of the specified command mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_cmd_is_valid_cstat(union scsw *scsw) +{ + return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && + (scsw->cmd.cc != 3); +} + +/** + * scsw_tm_is_valid_key - check key field validity + * @scsw: pointer to scsw + * + * Return non-zero if the key field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_key(union scsw *scsw) +{ + return (scsw->tm.fctl & SCSW_FCTL_START_FUNC); +} + +/** + * scsw_tm_is_valid_eswf - check eswf field validity + * @scsw: pointer to scsw + * + * Return non-zero if the eswf field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_eswf(union scsw *scsw) +{ + return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND); +} + +/** + * scsw_tm_is_valid_cc - check cc field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cc field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_cc(union scsw *scsw) +{ + return (scsw->tm.fctl & SCSW_FCTL_START_FUNC) && + (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND); +} + +/** + * scsw_tm_is_valid_fmt - check fmt field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fmt field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_fmt(union scsw *scsw) +{ + return 1; +} + +/** + * scsw_tm_is_valid_x - check x field validity + * @scsw: pointer to scsw + * + * Return non-zero if the x field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_x(union scsw *scsw) +{ + return 1; +} + +/** + * scsw_tm_is_valid_q - check q field validity + * @scsw: pointer to scsw + * + * Return non-zero if the q field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_q(union scsw *scsw) +{ + return 1; +} + +/** + * scsw_tm_is_valid_ectl - check ectl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the ectl field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_ectl(union scsw *scsw) +{ + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Must have alert status. */ + if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS)) + return 0; + + /* Must be alone or together with primary, secondary or both, + * => no intermediate status. + */ + if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) + return 0; + + return 1; +} + +/** + * scsw_tm_is_valid_pno - check pno field validity + * @scsw: pointer to scsw + * + * Return non-zero if the pno field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_pno(union scsw *scsw) +{ + /* Must indicate at least one I/O function. */ + if (!scsw->tm.fctl) + return 0; + + /* Must be status pending. */ + if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND)) + return 0; + + /* Can be status pending alone, or with any combination of primary, + * secondary and alert => no intermediate status. + */ + if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)) + return 1; + + /* If intermediate, must be suspended. */ + if (scsw->tm.actl & SCSW_ACTL_SUSPENDED) + return 1; + + return 0; +} + +/** + * scsw_tm_is_valid_fctl - check fctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fctl field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_fctl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_tm_is_valid_actl - check actl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the actl field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_actl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_tm_is_valid_stctl - check stctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the stctl field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_stctl(union scsw *scsw) +{ + /* Only valid if pmcw.dnv == 1*/ + return 1; +} + +/** + * scsw_tm_is_valid_dstat - check dstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the dstat field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_dstat(union scsw *scsw) +{ + return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && + (scsw->tm.cc != 3); +} + +/** + * scsw_tm_is_valid_cstat - check cstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cstat field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_cstat(union scsw *scsw) +{ + return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && + (scsw->tm.cc != 3); +} + +/** + * scsw_tm_is_valid_fcxs - check fcxs field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fcxs field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_fcxs(union scsw *scsw) +{ + return 1; +} + +/** + * scsw_tm_is_valid_schxs - check schxs field validity + * @scsw: pointer to scsw + * + * Return non-zero if the schxs field of the specified transport mode scsw is + * valid, zero otherwise. + */ +static inline int scsw_tm_is_valid_schxs(union scsw *scsw) +{ + return (scsw->tm.cstat & (SCHN_STAT_PROG_CHECK | + SCHN_STAT_INTF_CTRL_CHK | + SCHN_STAT_PROT_CHECK | + SCHN_STAT_CHN_DATA_CHK)); +} + +/** + * scsw_is_valid_actl - check actl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the actl field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_actl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_actl(scsw); + else + return scsw_cmd_is_valid_actl(scsw); +} + +/** + * scsw_is_valid_cc - check cc field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cc field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_cc(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_cc(scsw); + else + return scsw_cmd_is_valid_cc(scsw); +} + +/** + * scsw_is_valid_cstat - check cstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the cstat field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_cstat(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_cstat(scsw); + else + return scsw_cmd_is_valid_cstat(scsw); +} + +/** + * scsw_is_valid_dstat - check dstat field validity + * @scsw: pointer to scsw + * + * Return non-zero if the dstat field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_dstat(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_dstat(scsw); + else + return scsw_cmd_is_valid_dstat(scsw); +} + +/** + * scsw_is_valid_ectl - check ectl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the ectl field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_ectl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_ectl(scsw); + else + return scsw_cmd_is_valid_ectl(scsw); +} + +/** + * scsw_is_valid_eswf - check eswf field validity + * @scsw: pointer to scsw + * + * Return non-zero if the eswf field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_eswf(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_eswf(scsw); + else + return scsw_cmd_is_valid_eswf(scsw); +} + +/** + * scsw_is_valid_fctl - check fctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the fctl field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_fctl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_fctl(scsw); + else + return scsw_cmd_is_valid_fctl(scsw); +} + +/** + * scsw_is_valid_key - check key field validity + * @scsw: pointer to scsw + * + * Return non-zero if the key field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_key(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_key(scsw); + else + return scsw_cmd_is_valid_key(scsw); +} + +/** + * scsw_is_valid_pno - check pno field validity + * @scsw: pointer to scsw + * + * Return non-zero if the pno field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_pno(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_pno(scsw); + else + return scsw_cmd_is_valid_pno(scsw); +} + +/** + * scsw_is_valid_stctl - check stctl field validity + * @scsw: pointer to scsw + * + * Return non-zero if the stctl field of the specified scsw is valid, + * regardless of whether it is a transport mode or command mode scsw. + * Return zero if the field does not contain a valid value. + */ +static inline int scsw_is_valid_stctl(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_valid_stctl(scsw); + else + return scsw_cmd_is_valid_stctl(scsw); +} + +/** + * scsw_cmd_is_solicited - check for solicited scsw + * @scsw: pointer to scsw + * + * Return non-zero if the command mode scsw indicates that the associated + * status condition is solicited, zero if it is unsolicited. + */ +static inline int scsw_cmd_is_solicited(union scsw *scsw) +{ + return (scsw->cmd.cc != 0) || (scsw->cmd.stctl != + (SCSW_STCTL_STATUS_PEND | SCSW_STCTL_ALERT_STATUS)); +} + +/** + * scsw_tm_is_solicited - check for solicited scsw + * @scsw: pointer to scsw + * + * Return non-zero if the transport mode scsw indicates that the associated + * status condition is solicited, zero if it is unsolicited. + */ +static inline int scsw_tm_is_solicited(union scsw *scsw) +{ + return (scsw->tm.cc != 0) || (scsw->tm.stctl != + (SCSW_STCTL_STATUS_PEND | SCSW_STCTL_ALERT_STATUS)); +} + +/** + * scsw_is_solicited - check for solicited scsw + * @scsw: pointer to scsw + * + * Return non-zero if the transport or command mode scsw indicates that the + * associated status condition is solicited, zero if it is unsolicited. + */ +static inline int scsw_is_solicited(union scsw *scsw) +{ + if (scsw_is_tm(scsw)) + return scsw_tm_is_solicited(scsw); + else + return scsw_cmd_is_solicited(scsw); +} + +#endif /* _ASM_S390_SCSW_H_ */ diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h new file mode 100644 index 0000000000..71d46f0ba9 --- /dev/null +++ b/arch/s390/include/asm/seccomp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_SECCOMP_H +#define _ASM_S390_SECCOMP_H + +#include + +#define __NR_seccomp_read __NR_read +#define __NR_seccomp_write __NR_write +#define __NR_seccomp_exit __NR_exit +#define __NR_seccomp_sigreturn __NR_sigreturn + +#define __NR_seccomp_read_32 __NR_read +#define __NR_seccomp_write_32 __NR_write +#define __NR_seccomp_exit_32 __NR_exit +#define __NR_seccomp_sigreturn_32 __NR_sigreturn + +#include + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "s390x" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "s390" +#endif + +#endif /* _ASM_S390_SECCOMP_H */ diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h new file mode 100644 index 0000000000..0486e6ef62 --- /dev/null +++ b/arch/s390/include/asm/sections.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_SECTIONS_H +#define _S390_SECTIONS_H + +#include + +/* + * .boot.data section contains variables "shared" between the decompressor and + * the decompressed kernel. The decompressor will store values in them, and + * copy over to the decompressed image before starting it. + * + * Each variable end up in its own intermediate section .boot.data., + * those sections are later sorted by alignment + name and merged together into + * final .boot.data section, which should be identical in the decompressor and + * the decompressed kernel (that is checked during the build). + */ +#define __bootdata(var) __section(".boot.data." #var) var + +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define __bootdata_preserved(var) __section(".boot.preserved.data." #var) var + +extern char *__samode31, *__eamode31; +extern char *__stext_amode31, *__etext_amode31; + +#endif diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h new file mode 100644 index 0000000000..06fbabe2f6 --- /dev/null +++ b/arch/s390/include/asm/set_memory.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASMS390_SET_MEMORY_H +#define _ASMS390_SET_MEMORY_H + +#include + +extern struct mutex cpa_mutex; + +enum { + _SET_MEMORY_RO_BIT, + _SET_MEMORY_RW_BIT, + _SET_MEMORY_NX_BIT, + _SET_MEMORY_X_BIT, + _SET_MEMORY_4K_BIT, + _SET_MEMORY_INV_BIT, + _SET_MEMORY_DEF_BIT, +}; + +#define SET_MEMORY_RO BIT(_SET_MEMORY_RO_BIT) +#define SET_MEMORY_RW BIT(_SET_MEMORY_RW_BIT) +#define SET_MEMORY_NX BIT(_SET_MEMORY_NX_BIT) +#define SET_MEMORY_X BIT(_SET_MEMORY_X_BIT) +#define SET_MEMORY_4K BIT(_SET_MEMORY_4K_BIT) +#define SET_MEMORY_INV BIT(_SET_MEMORY_INV_BIT) +#define SET_MEMORY_DEF BIT(_SET_MEMORY_DEF_BIT) + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags); + +#define set_memory_rox set_memory_rox + +/* + * Generate two variants of each set_memory() function: + * + * set_memory_yy(unsigned long addr, int numpages); + * __set_memory_yy(void *start, void *end); + * + * The second variant exists for both convenience to avoid the usual + * (unsigned long) casts, but unlike the first variant it can also be used + * for areas larger than 8TB, which may happen at memory initialization. + */ +#define __SET_MEMORY_FUNC(fname, flags) \ +static inline int fname(unsigned long addr, int numpages) \ +{ \ + return __set_memory(addr, numpages, (flags)); \ +} \ + \ +static inline int __##fname(void *start, void *end) \ +{ \ + unsigned long numpages; \ + \ + numpages = (end - start) >> PAGE_SHIFT; \ + return __set_memory((unsigned long)start, numpages, (flags)); \ +} + +__SET_MEMORY_FUNC(set_memory_ro, SET_MEMORY_RO) +__SET_MEMORY_FUNC(set_memory_rw, SET_MEMORY_RW) +__SET_MEMORY_FUNC(set_memory_nx, SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_x, SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X) +__SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX) +__SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) + +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + +#endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h new file mode 100644 index 0000000000..25cadc2b9c --- /dev/null +++ b/arch/s390/include/asm/setup.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2017 + */ +#ifndef _ASM_S390_SETUP_H +#define _ASM_S390_SETUP_H + +#include +#include +#include + +#define PARMAREA 0x10400 + +#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE +/* + * Machine features detected in early.c + */ + +#define MACHINE_FLAG_VM BIT(0) +#define MACHINE_FLAG_KVM BIT(1) +#define MACHINE_FLAG_LPAR BIT(2) +#define MACHINE_FLAG_DIAG9C BIT(3) +#define MACHINE_FLAG_ESOP BIT(4) +#define MACHINE_FLAG_IDTE BIT(5) +#define MACHINE_FLAG_EDAT1 BIT(7) +#define MACHINE_FLAG_EDAT2 BIT(8) +#define MACHINE_FLAG_TOPOLOGY BIT(10) +#define MACHINE_FLAG_TE BIT(11) +#define MACHINE_FLAG_TLB_LC BIT(12) +#define MACHINE_FLAG_VX BIT(13) +#define MACHINE_FLAG_TLB_GUEST BIT(14) +#define MACHINE_FLAG_NX BIT(15) +#define MACHINE_FLAG_GS BIT(16) +#define MACHINE_FLAG_SCC BIT(17) +#define MACHINE_FLAG_PCI_MIO BIT(18) +#define MACHINE_FLAG_RDP BIT(19) + +#define LPP_MAGIC BIT(31) +#define LPP_PID_MASK _AC(0xffffffff, UL) + +/* Offsets to entry points in kernel/head.S */ + +#define STARTUP_NORMAL_OFFSET 0x10000 +#define STARTUP_KDUMP_OFFSET 0x10010 + +#define LEGACY_COMMAND_LINE_SIZE 896 + +#ifndef __ASSEMBLY__ + +#include +#include + +struct parmarea { + unsigned long ipl_device; /* 0x10400 */ + unsigned long initrd_start; /* 0x10408 */ + unsigned long initrd_size; /* 0x10410 */ + unsigned long oldmem_base; /* 0x10418 */ + unsigned long oldmem_size; /* 0x10420 */ + unsigned long kernel_version; /* 0x10428 */ + unsigned long max_command_line_size; /* 0x10430 */ + char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */ + char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ +}; + +extern struct parmarea parmarea; + +extern unsigned int zlib_dfltcc_support; +#define ZLIB_DFLTCC_DISABLED 0 +#define ZLIB_DFLTCC_FULL 1 +#define ZLIB_DFLTCC_DEFLATE_ONLY 2 +#define ZLIB_DFLTCC_INFLATE_ONLY 3 +#define ZLIB_DFLTCC_FULL_DEBUG 4 + +extern unsigned long ident_map_size; +extern unsigned long max_mappable; + +/* The Write Back bit position in the physaddr is given by the SLPC PCI */ +extern unsigned long mio_wb_bit_mask; + +#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) +#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) +#define MACHINE_IS_LPAR (S390_lowcore.machine_flags & MACHINE_FLAG_LPAR) + +#define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) +#define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) +#define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE) +#define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1) +#define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2) +#define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) +#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) +#define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) +#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) +#define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) +#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) +#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) +#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) +#define MACHINE_HAS_RDP (S390_lowcore.machine_flags & MACHINE_FLAG_RDP) + +/* + * Console mode. Override with conmode= + */ +extern unsigned int console_mode; +extern unsigned int console_devno; +extern unsigned int console_irq; + +#define CONSOLE_IS_UNDEFINED (console_mode == 0) +#define CONSOLE_IS_SCLP (console_mode == 1) +#define CONSOLE_IS_3215 (console_mode == 2) +#define CONSOLE_IS_3270 (console_mode == 3) +#define CONSOLE_IS_VT220 (console_mode == 4) +#define CONSOLE_IS_HVC (console_mode == 5) +#define SET_CONSOLE_SCLP do { console_mode = 1; } while (0) +#define SET_CONSOLE_3215 do { console_mode = 2; } while (0) +#define SET_CONSOLE_3270 do { console_mode = 3; } while (0) +#define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) +#define SET_CONSOLE_HVC do { console_mode = 5; } while (0) + +#ifdef CONFIG_VMCP +void vmcp_cma_reserve(void); +#else +static inline void vmcp_cma_reserve(void) { } +#endif + +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); + +void cmma_init(void); +void cmma_init_nodat(void); + +extern void (*_machine_restart)(char *command); +extern void (*_machine_halt)(void); +extern void (*_machine_power_off)(void); + +extern unsigned long __kaslr_offset; +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + +extern int __kaslr_enabled; +static inline int kaslr_enabled(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return __kaslr_enabled; + return 0; +} + +struct oldmem_data { + unsigned long start; + unsigned long size; +}; +extern struct oldmem_data oldmem_data; + +static __always_inline u32 gen_lpswe(unsigned long addr) +{ + BUILD_BUG_ON(addr > 0xfff); + return 0xb2b20000 | addr; +} +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_SETUP_H */ diff --git a/arch/s390/include/asm/signal.h b/arch/s390/include/asm/signal.h new file mode 100644 index 0000000000..7daf4d8b5e --- /dev/null +++ b/arch/s390/include/asm/signal.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * + * Derived from "include/asm-i386/signal.h" + */ +#ifndef _ASMS390_SIGNAL_H +#define _ASMS390_SIGNAL_H + +#include + +/* Most things should be clean enough to redefine this at will, if care + is taken to make libc match. */ +#include +#define _NSIG _SIGCONTEXT_NSIG +#define _NSIG_BPW _SIGCONTEXT_NSIG_BPW +#define _NSIG_WORDS _SIGCONTEXT_NSIG_WORDS + +typedef unsigned long old_sigset_t; /* at least 32 bits */ + +typedef struct { + unsigned long sig[_NSIG_WORDS]; +} sigset_t; + +#define __ARCH_HAS_SA_RESTORER +#endif diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h new file mode 100644 index 0000000000..edee63da08 --- /dev/null +++ b/arch/s390/include/asm/sigp.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_SIGP_H +#define __S390_ASM_SIGP_H + +/* SIGP order codes */ +#define SIGP_SENSE 1 +#define SIGP_EXTERNAL_CALL 2 +#define SIGP_EMERGENCY_SIGNAL 3 +#define SIGP_START 4 +#define SIGP_STOP 5 +#define SIGP_RESTART 6 +#define SIGP_STOP_AND_STORE_STATUS 9 +#define SIGP_INITIAL_CPU_RESET 11 +#define SIGP_CPU_RESET 12 +#define SIGP_SET_PREFIX 13 +#define SIGP_STORE_STATUS_AT_ADDRESS 14 +#define SIGP_SET_ARCHITECTURE 18 +#define SIGP_COND_EMERGENCY_SIGNAL 19 +#define SIGP_SENSE_RUNNING 21 +#define SIGP_SET_MULTI_THREADING 22 +#define SIGP_STORE_ADDITIONAL_STATUS 23 + +/* SIGP condition codes */ +#define SIGP_CC_ORDER_CODE_ACCEPTED 0 +#define SIGP_CC_STATUS_STORED 1 +#define SIGP_CC_BUSY 2 +#define SIGP_CC_NOT_OPERATIONAL 3 + +/* SIGP cpu status bits */ + +#define SIGP_STATUS_INVALID_ORDER 0x00000002UL +#define SIGP_STATUS_CHECK_STOP 0x00000010UL +#define SIGP_STATUS_STOPPED 0x00000040UL +#define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL +#define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL +#define SIGP_STATUS_INCORRECT_STATE 0x00000200UL +#define SIGP_STATUS_NOT_RUNNING 0x00000400UL + +#ifndef __ASSEMBLY__ + +static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, + u32 *status) +{ + union register_pair r1 = { .odd = parm, }; + int cc; + + asm volatile( + " sigp %[r1],%[addr],0(%[order])\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair) + : [addr] "d" (addr), [order] "a" (order) + : "cc"); + *status = r1.even; + return cc; +} + +static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, + u32 *status) +{ + u32 _status; + int cc; + + cc = ____pcpu_sigp(addr, order, parm, &_status); + if (status && cc == SIGP_CC_STATUS_STORED) + *status = _status; + return cc; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __S390_ASM_SIGP_H */ diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h new file mode 100644 index 0000000000..73ed278107 --- /dev/null +++ b/arch/s390/include/asm/smp.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2012 + * Author(s): Denis Joseph Barrow, + * Martin Schwidefsky , + */ +#ifndef __ASM_SMP_H +#define __ASM_SMP_H + +#include +#include +#include + +#define raw_smp_processor_id() (S390_lowcore.cpu_nr) + +extern struct mutex smp_cpu_state_mutex; +extern unsigned int smp_cpu_mt_shift; +extern unsigned int smp_cpu_mtid; +extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +extern cpumask_t cpu_setup_mask; + +extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); + +extern void arch_send_call_function_single_ipi(int cpu); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); + +extern void smp_call_online_cpu(void (*func)(void *), void *); +extern void smp_call_ipl_cpu(void (*func)(void *), void *); +extern void smp_emergency_stop(void); + +extern int smp_find_processor_id(u16 address); +extern int smp_store_status(int cpu); +extern void smp_save_dump_ipl_cpu(void); +extern void smp_save_dump_secondary_cpus(void); +extern void smp_yield_cpu(int cpu); +extern void smp_cpu_set_polarization(int cpu, int val); +extern int smp_cpu_get_polarization(int cpu); +extern int smp_cpu_get_cpu_address(int cpu); +extern void smp_fill_possible_mask(void); +extern void smp_detect_cpus(void); + +static inline void smp_stop_cpu(void) +{ + u16 pcpu = stap(); + + for (;;) { + __pcpu_sigp(pcpu, SIGP_STOP, 0, NULL); + cpu_relax(); + } +} + +/* Return thread 0 CPU number as base CPU */ +static inline int smp_get_base_cpu(int cpu) +{ + return cpu - (cpu % (smp_cpu_mtid + 1)); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ +} + +extern int smp_reinit_ipl_cpu(void); +extern int smp_rescan_cpus(void); +extern void __noreturn cpu_die(void); +extern void __cpu_die(unsigned int cpu); +extern int __cpu_disable(void); +extern void schedule_mcck_handler(void); +void notrace smp_yield_cpu(int cpu); + +#endif /* __ASM_SMP_H */ diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h new file mode 100644 index 0000000000..1ac5115d31 --- /dev/null +++ b/arch/s390/include/asm/softirq_stack.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_S390_SOFTIRQ_STACK_H +#define __ASM_S390_SOFTIRQ_STACK_H + +#include +#include + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static inline void do_softirq_own_stack(void) +{ + call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq); +} +#endif +#endif /* __ASM_S390_SOFTIRQ_STACK_H */ diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h new file mode 100644 index 0000000000..c549893602 --- /dev/null +++ b/arch/s390/include/asm/sparsemem.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_SPARSEMEM_H +#define _ASM_S390_SPARSEMEM_H + +#define SECTION_SIZE_BITS 28 +#define MAX_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS + +#endif /* _ASM_S390_SPARSEMEM_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h new file mode 100644 index 0000000000..37127cd774 --- /dev/null +++ b/arch/s390/include/asm/spinlock.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/spinlock.h" + */ + +#ifndef __ASM_SPINLOCK_H +#define __ASM_SPINLOCK_H + +#include +#include +#include +#include +#include + +#define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval) + +extern int spin_retry; + +bool arch_vcpu_is_preempted(int cpu); + +#define vcpu_is_preempted arch_vcpu_is_preempted + +/* + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * We make no fairness assumptions. They have a cost. + * + * (the type definitions are in asm/spinlock_types.h) + */ + +void arch_spin_relax(arch_spinlock_t *lock); +#define arch_spin_relax arch_spin_relax + +void arch_spin_lock_wait(arch_spinlock_t *); +int arch_spin_trylock_retry(arch_spinlock_t *); +void arch_spin_lock_setup(int cpu); + +static inline u32 arch_spin_lockval(int cpu) +{ + return cpu + 1; +} + +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.lock == 0; +} + +static inline int arch_spin_is_locked(arch_spinlock_t *lp) +{ + return READ_ONCE(lp->lock) != 0; +} + +static inline int arch_spin_trylock_once(arch_spinlock_t *lp) +{ + barrier(); + return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); +} + +static inline void arch_spin_lock(arch_spinlock_t *lp) +{ + if (!arch_spin_trylock_once(lp)) + arch_spin_lock_wait(lp); +} + +static inline int arch_spin_trylock(arch_spinlock_t *lp) +{ + if (!arch_spin_trylock_once(lp)) + return arch_spin_trylock_retry(lp); + return 1; +} + +static inline void arch_spin_unlock(arch_spinlock_t *lp) +{ + typecheck(int, lp->lock); + kcsan_release(); + asm_inline volatile( + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ + " sth %1,%0\n" + : "=R" (((unsigned short *) &lp->lock)[1]) + : "d" (0) : "cc", "memory"); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + */ + +#define arch_read_relax(rw) barrier() +#define arch_write_relax(rw) barrier() + +void arch_read_lock_wait(arch_rwlock_t *lp); +void arch_write_lock_wait(arch_rwlock_t *lp); + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + int old; + + old = __atomic_add(1, &rw->cnts); + if (old & 0xffff0000) + arch_read_lock_wait(rw); +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + __atomic_add_const_barrier(-1, &rw->cnts); +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + if (!__atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000)) + arch_write_lock_wait(rw); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + __atomic_add_barrier(-0x30000, &rw->cnts); +} + + +static inline int arch_read_trylock(arch_rwlock_t *rw) +{ + int old; + + old = READ_ONCE(rw->cnts); + return (!(old & 0xffff0000) && + __atomic_cmpxchg_bool(&rw->cnts, old, old + 1)); +} + +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + int old; + + old = READ_ONCE(rw->cnts); + return !old && __atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000); +} + +#endif /* __ASM_SPINLOCK_H */ diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h new file mode 100644 index 0000000000..b69695e399 --- /dev/null +++ b/arch/s390/include/asm/spinlock_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SPINLOCK_TYPES_H +#define __ASM_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H +# error "please don't include this file directly" +#endif + +typedef struct { + int lock; +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, } + +typedef struct { + int cnts; + arch_spinlock_t wait; +} arch_rwlock_t; + +#define __ARCH_RW_LOCK_UNLOCKED { 0 } + +#endif diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h new file mode 100644 index 0000000000..78f7b729b6 --- /dev/null +++ b/arch/s390/include/asm/stacktrace.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_STACKTRACE_H +#define _ASM_S390_STACKTRACE_H + +#include +#include +#include + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_NODAT, + STACK_TYPE_RESTART, + STACK_TYPE_MCCK, +}; + +struct stack_info { + enum stack_type type; + unsigned long begin, end; +}; + +const char *stack_type_name(enum stack_type type); +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +static inline bool on_stack(struct stack_info *info, + unsigned long addr, size_t len) +{ + if (info->type == STACK_TYPE_UNKNOWN) + return false; + if (addr + len < addr) + return false; + return addr >= info->begin && addr + len <= info->end; +} + +/* + * Stack layout of a C stack frame. + * Kernel uses the packed stack layout (-mpacked-stack). + */ +struct stack_frame { + union { + unsigned long empty[9]; + struct { + unsigned long sie_control_block; + unsigned long sie_savearea; + unsigned long sie_reason; + unsigned long sie_flags; + unsigned long sie_control_block_phys; + }; + }; + unsigned long gprs[10]; + unsigned long back_chain; +}; + +/* + * Unlike current_stack_pointer which simply contains the current value of %r15 + * current_frame_address() returns function stack frame address, which matches + * %r15 upon function invocation. It may differ from %r15 later if function + * allocates stack for local variables or new stack frame to call other + * functions. + */ +#define current_frame_address() \ + ((unsigned long)__builtin_frame_address(0) - \ + offsetof(struct stack_frame, back_chain)) + +static __always_inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long)kernel_stack_pointer(regs); + if (task == current) + return current_frame_address(); + return (unsigned long)task->thread.ksp; +} + +/* + * To keep this simple mark register 2-6 as being changed (volatile) + * by the called function, even though register 6 is saved/nonvolatile. + */ +#define CALL_FMT_0 "=&d" (r2) +#define CALL_FMT_1 "+&d" (r2) +#define CALL_FMT_2 CALL_FMT_1, "+&d" (r3) +#define CALL_FMT_3 CALL_FMT_2, "+&d" (r4) +#define CALL_FMT_4 CALL_FMT_3, "+&d" (r5) +#define CALL_FMT_5 CALL_FMT_4, "+&d" (r6) + +#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" +#define CALL_CLOBBER_4 CALL_CLOBBER_5 +#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" +#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" +#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" +#define CALL_CLOBBER_0 CALL_CLOBBER_1 + +#define CALL_LARGS_0(...) \ + long dummy = 0 +#define CALL_LARGS_1(t1, a1) \ + long arg1 = (long)(t1)(a1) +#define CALL_LARGS_2(t1, a1, t2, a2) \ + CALL_LARGS_1(t1, a1); \ + long arg2 = (long)(t2)(a2) +#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \ + CALL_LARGS_2(t1, a1, t2, a2); \ + long arg3 = (long)(t3)(a3) +#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \ + CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \ + long arg4 = (long)(t4)(a4) +#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \ + CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \ + long arg5 = (long)(t5)(a5) + +#define CALL_REGS_0 \ + register long r2 asm("2") = dummy +#define CALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define CALL_REGS_2 \ + CALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define CALL_REGS_3 \ + CALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define CALL_REGS_4 \ + CALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define CALL_REGS_5 \ + CALL_REGS_4; \ + register long r6 asm("6") = arg5 + +#define CALL_TYPECHECK_0(...) +#define CALL_TYPECHECK_1(t, a, ...) \ + typecheck(t, a) +#define CALL_TYPECHECK_2(t, a, ...) \ + CALL_TYPECHECK_1(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_3(t, a, ...) \ + CALL_TYPECHECK_2(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_4(t, a, ...) \ + CALL_TYPECHECK_3(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_5(t, a, ...) \ + CALL_TYPECHECK_4(__VA_ARGS__); \ + typecheck(t, a) + +#define CALL_PARM_0(...) void +#define CALL_PARM_1(t, a, ...) t +#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__) +#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__) +#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__) +#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__) +#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__) + +/* + * Use call_on_stack() to call a function switching to a specified + * stack. Proper sign and zero extension of function arguments is + * done. Usage: + * + * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - stack specifies the stack to be used. + * - fn is the function to be called. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + */ +#define call_on_stack(nr, stack, rettype, fn, ...) \ +({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \ + unsigned long frame = current_frame_address(); \ + unsigned long __stack = stack; \ + unsigned long prev; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ + \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ + asm volatile( \ + " lgr %[_prev],15\n" \ + " lg 15,%[_stack]\n" \ + " stg %[_frame],%[_bc](15)\n" \ + " brasl 14,%[_fn]\n" \ + " lgr 15,%[_prev]\n" \ + : [_prev] "=&d" (prev), CALL_FMT_##nr \ + : [_stack] "R" (__stack), \ + [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ + [_frame] "d" (frame), \ + [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \ + (rettype)r2; \ +}) + +/* + * Use call_nodat() to call a function with DAT disabled. + * Proper sign and zero extension of function arguments is done. + * Usage: + * + * rc = call_nodat(nr, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - fn is the function to be called, where fn is a physical address. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + * + * fn() is called with standard C function call ABI, with the exception + * that no useful stackframe or stackpointer is passed via register 15. + * Therefore the called function must not use r15 to access the stack. + */ +#define call_nodat(nr, rettype, fn, ...) \ +({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = (fn); \ + /* aligned since psw_leave must not cross page boundary */ \ + psw_t __aligned(16) psw_leave; \ + psw_t psw_enter; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ + \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ + psw_enter.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; \ + psw_enter.addr = (unsigned long)__fn; \ + asm volatile( \ + " epsw 0,1\n" \ + " risbg 1,0,0,31,32\n" \ + " larl 7,1f\n" \ + " stg 1,%[psw_leave]\n" \ + " stg 7,8+%[psw_leave]\n" \ + " la 7,%[psw_leave]\n" \ + " lra 7,0(7)\n" \ + " larl 1,0f\n" \ + " lra 14,0(1)\n" \ + " lpswe %[psw_enter]\n" \ + "0: lpswe 0(7)\n" \ + "1:\n" \ + : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \ + : [psw_enter] "Q" (psw_enter) \ + : "7", CALL_CLOBBER_##nr); \ + (rettype)r2; \ +}) + +#endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h new file mode 100644 index 0000000000..4d74d7e333 --- /dev/null +++ b/arch/s390/include/asm/stp.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2006 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#ifndef __S390_STP_H +#define __S390_STP_H + +#include + +/* notifier for syncs */ +extern struct atomic_notifier_head s390_epoch_delta_notifier; + +/* STP interruption parameter */ +struct stp_irq_parm { + u32 : 14; + u32 tsc : 1; /* Timing status change */ + u32 lac : 1; /* Link availability change */ + u32 tcpc : 1; /* Time control parameter change */ + u32 : 15; +} __packed; + +#define STP_OP_SYNC 1 +#define STP_OP_CTRL 3 + +struct stp_sstpi { + u32 : 32; + u32 tu : 1; + u32 lu : 1; + u32 : 6; + u32 stratum : 8; + u32 vbits : 16; + u32 leaps : 16; + u32 tmd : 4; + u32 ctn : 4; + u32 : 3; + u32 c : 1; + u32 tst : 4; + u32 tzo : 16; + u32 dsto : 16; + u32 ctrl : 16; + u32 : 16; + u32 tto; + u32 : 32; + u32 ctnid[3]; + u32 : 32; + u64 todoff; + u32 rsvd[50]; +} __packed; + +struct stp_tzib { + u32 tzan : 16; + u32 : 16; + u32 tzo : 16; + u32 dsto : 16; + u32 stn; + u32 dstn; + u64 dst_on_alg; + u64 dst_off_alg; +} __packed; + +struct stp_tcpib { + u32 atcode : 4; + u32 ntcode : 4; + u32 d : 1; + u32 : 23; + s32 tto; + struct stp_tzib atzib; + struct stp_tzib ntzib; + s32 adst_offset : 16; + s32 ndst_offset : 16; + u32 rsvd1; + u64 ntzib_update; + u64 ndsto_update; +} __packed; + +struct stp_lsoib { + u32 p : 1; + u32 : 31; + s32 also : 16; + s32 nlso : 16; + u64 nlsout; +} __packed; + +struct stp_stzi { + u32 rsvd0[3]; + u64 data_ts; + u32 rsvd1[22]; + struct stp_tcpib tcpib; + struct stp_lsoib lsoib; +} __packed; + +/* Functions needed by the machine check handler */ +int stp_sync_check(void); +int stp_island_check(void); +void stp_queue_work(void); + +#endif /* __S390_STP_H */ diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h new file mode 100644 index 0000000000..351685de53 --- /dev/null +++ b/arch/s390/include/asm/string.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + */ + +#ifndef _S390_STRING_H_ +#define _S390_STRING_H_ + +#ifndef _LINUX_TYPES_H +#include +#endif + +#define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMSET16 /* arch function */ +#define __HAVE_ARCH_MEMSET32 /* arch function */ +#define __HAVE_ARCH_MEMSET64 /* arch function */ + +void *memcpy(void *dest, const void *src, size_t n); +void *memset(void *s, int c, size_t n); +void *memmove(void *dest, const void *src, size_t n); + +#ifndef CONFIG_KASAN +#define __HAVE_ARCH_MEMCHR /* inline & arch function */ +#define __HAVE_ARCH_MEMCMP /* arch function */ +#define __HAVE_ARCH_MEMSCAN /* inline & arch function */ +#define __HAVE_ARCH_STRCAT /* inline & arch function */ +#define __HAVE_ARCH_STRCMP /* arch function */ +#define __HAVE_ARCH_STRCPY /* inline & arch function */ +#define __HAVE_ARCH_STRLCAT /* arch function */ +#define __HAVE_ARCH_STRLEN /* inline & arch function */ +#define __HAVE_ARCH_STRNCAT /* arch function */ +#define __HAVE_ARCH_STRNCPY /* arch function */ +#define __HAVE_ARCH_STRNLEN /* inline & arch function */ +#define __HAVE_ARCH_STRSTR /* arch function */ + +/* Prototypes for non-inlined arch strings functions. */ +int memcmp(const void *s1, const void *s2, size_t n); +int strcmp(const char *s1, const char *s2); +size_t strlcat(char *dest, const char *src, size_t n); +char *strncat(char *dest, const char *src, size_t n); +char *strncpy(char *dest, const char *src, size_t n); +char *strstr(const char *s1, const char *s2); +#endif /* !CONFIG_KASAN */ + +#undef __HAVE_ARCH_STRCHR +#undef __HAVE_ARCH_STRNCHR +#undef __HAVE_ARCH_STRNCMP +#undef __HAVE_ARCH_STRPBRK +#undef __HAVE_ARCH_STRSEP +#undef __HAVE_ARCH_STRSPN + +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +#define strlen(s) __strlen(s) + +#define __no_sanitize_prefix_strfunc(x) __##x + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + +#else +#define __no_sanitize_prefix_strfunc(x) x +#endif /* defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) */ + +void *__memcpy(void *dest, const void *src, size_t n); +void *__memset(void *s, int c, size_t n); +void *__memmove(void *dest, const void *src, size_t n); +void *__memset16(uint16_t *s, uint16_t v, size_t count); +void *__memset32(uint32_t *s, uint32_t v, size_t count); +void *__memset64(uint64_t *s, uint64_t v, size_t count); + +static inline void *memset16(uint16_t *s, uint16_t v, size_t count) +{ + return __memset16(s, v, count * sizeof(v)); +} + +static inline void *memset32(uint32_t *s, uint32_t v, size_t count) +{ + return __memset32(s, v, count * sizeof(v)); +} + +static inline void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + return __memset64(s, v, count * sizeof(v)); +} + +#if !defined(IN_ARCH_STRING_C) && (!defined(CONFIG_FORTIFY_SOURCE) || defined(__NO_FORTIFY)) + +#ifdef __HAVE_ARCH_MEMCHR +static inline void *memchr(const void * s, int c, size_t n) +{ + const void *ret = s + n; + + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + " jl 1f\n" + " la %[ret],0\n" + "1:" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *) ret; +} +#endif + +#ifdef __HAVE_ARCH_MEMSCAN +static inline void *memscan(void *s, int c, size_t n) +{ + const void *ret = s + n; + + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *) ret; +} +#endif + +#ifdef __HAVE_ARCH_STRCAT +static inline char *strcat(char *dst, const char *src) +{ + unsigned long dummy = 0; + char *ret = dst; + + asm volatile( + " lghi 0,0\n" + "0: srst %[dummy],%[dst]\n" + " jo 0b\n" + "1: mvst %[dummy],%[src]\n" + " jo 1b" + : [dummy] "+&a" (dummy), [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); + return ret; +} +#endif + +#ifdef __HAVE_ARCH_STRCPY +static inline char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + asm volatile( + " lghi 0,0\n" + "0: mvst %[dst],%[src]\n" + " jo 0b" + : [dst] "+&a" (dst), [src] "+&a" (src) + : + : "cc", "memory", "0"); + return ret; +} +#endif + +#if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) +static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s) +{ + unsigned long end = 0; + const char *tmp = s; + + asm volatile( + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" + " jo 0b" + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); + return end - (unsigned long)s; +} +#endif + +#ifdef __HAVE_ARCH_STRNLEN +static inline size_t strnlen(const char * s, size_t n) +{ + const char *tmp = s; + const char *end = s + n; + + asm volatile( + " lghi 0,0\n" + "0: srst %[end],%[tmp]\n" + " jo 0b" + : [end] "+&a" (end), [tmp] "+&a" (tmp) + : + : "cc", "memory", "0"); + return end - s; +} +#endif +#else /* IN_ARCH_STRING_C */ +void *memchr(const void * s, int c, size_t n); +void *memscan(void *s, int c, size_t n); +char *strcat(char *dst, const char *src); +char *strcpy(char *dst, const char *src); +size_t strlen(const char *s); +size_t strnlen(const char * s, size_t n); +#endif /* !IN_ARCH_STRING_C */ + +#endif /* __S390_STRING_H_ */ diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h new file mode 100644 index 0000000000..c61b2cc1a8 --- /dev/null +++ b/arch/s390/include/asm/switch_to.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2009 + * + * Author(s): Martin Schwidefsky + */ + +#ifndef __ASM_SWITCH_TO_H +#define __ASM_SWITCH_TO_H + +#include +#include +#include +#include + +extern struct task_struct *__switch_to(void *, void *); +extern void update_cr_regs(struct task_struct *task); + +static inline void save_access_regs(unsigned int *acrs) +{ + typedef struct { int _[NUM_ACRS]; } acrstype; + + asm volatile("stam 0,15,%0" : "=Q" (*(acrstype *)acrs)); +} + +static inline void restore_access_regs(unsigned int *acrs) +{ + typedef struct { int _[NUM_ACRS]; } acrstype; + + asm volatile("lam 0,15,%0" : : "Q" (*(acrstype *)acrs)); +} + +#define switch_to(prev, next, last) do { \ + /* save_fpu_regs() sets the CIF_FPU flag, which enforces \ + * a restore of the floating point / vector registers as \ + * soon as the next task returns to user space \ + */ \ + save_fpu_regs(); \ + save_access_regs(&prev->thread.acrs[0]); \ + save_ri_cb(prev->thread.ri_cb); \ + save_gs_cb(prev->thread.gs_cb); \ + update_cr_regs(next); \ + restore_access_regs(&next->thread.acrs[0]); \ + restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ + restore_gs_cb(next->thread.gs_cb); \ + prev = __switch_to(prev, next); \ +} while (0) + +#endif /* __ASM_SWITCH_TO_H */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h new file mode 100644 index 0000000000..27e3d804b3 --- /dev/null +++ b/arch/s390/include/asm/syscall.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Access to user system call parameters and results + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#ifndef _ASM_SYSCALL_H +#define _ASM_SYSCALL_H 1 + +#include +#include +#include +#include + +extern const sys_call_ptr_t sys_call_table[]; +extern const sys_call_ptr_t sys_call_table_emu[]; + +static inline long syscall_get_nr(struct task_struct *task, + struct pt_regs *regs) +{ + return test_pt_regs_flag(regs, PIF_SYSCALL) ? + (regs->int_code & 0xffff) : -1; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->gprs[2] = regs->orig_gpr2; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->gprs[2]; +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) { + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long)(int)error; + } +#endif + return IS_ERR_VALUE(error) ? error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->gprs[2]; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + set_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); + regs->gprs[2] = error ? error : val; +} + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + unsigned long mask = -1UL; + unsigned int n = 6; + +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) + mask = 0xffffffff; +#endif + while (n-- > 0) + if (n > 0) + args[n] = regs->gprs[2 + n] & mask; + + args[0] = regs->orig_gpr2 & mask; +} + +static inline int syscall_get_arch(struct task_struct *task) +{ +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) + return AUDIT_ARCH_S390; +#endif + return AUDIT_ARCH_S390X; +} + +static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ + return false; +} + +#define SYSCALL_FMT_0 +#define SYSCALL_FMT_1 , "0" (r2) +#define SYSCALL_FMT_2 , "d" (r3) SYSCALL_FMT_1 +#define SYSCALL_FMT_3 , "d" (r4) SYSCALL_FMT_2 +#define SYSCALL_FMT_4 , "d" (r5) SYSCALL_FMT_3 +#define SYSCALL_FMT_5 , "d" (r6) SYSCALL_FMT_4 +#define SYSCALL_FMT_6 , "d" (r7) SYSCALL_FMT_5 + +#define SYSCALL_PARM_0 +#define SYSCALL_PARM_1 , long arg1 +#define SYSCALL_PARM_2 SYSCALL_PARM_1, long arg2 +#define SYSCALL_PARM_3 SYSCALL_PARM_2, long arg3 +#define SYSCALL_PARM_4 SYSCALL_PARM_3, long arg4 +#define SYSCALL_PARM_5 SYSCALL_PARM_4, long arg5 +#define SYSCALL_PARM_6 SYSCALL_PARM_5, long arg6 + +#define SYSCALL_REGS_0 +#define SYSCALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define SYSCALL_REGS_2 \ + SYSCALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define SYSCALL_REGS_3 \ + SYSCALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define SYSCALL_REGS_4 \ + SYSCALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define SYSCALL_REGS_5 \ + SYSCALL_REGS_4; \ + register long r6 asm("6") = arg5 +#define SYSCALL_REGS_6 \ + SYSCALL_REGS_5; \ + register long r7 asm("7") = arg6 + +#define GENERATE_SYSCALL_FUNC(nr) \ +static __always_inline \ +long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \ +{ \ + register unsigned long r1 asm ("1") = syscall; \ + register long rc asm ("2"); \ + SYSCALL_REGS_##nr; \ + \ + asm volatile ( \ + " svc 0\n" \ + : "=d" (rc) \ + : "d" (r1) SYSCALL_FMT_##nr \ + : "memory"); \ + return rc; \ +} + +GENERATE_SYSCALL_FUNC(0) +GENERATE_SYSCALL_FUNC(1) +GENERATE_SYSCALL_FUNC(2) +GENERATE_SYSCALL_FUNC(3) +GENERATE_SYSCALL_FUNC(4) +GENERATE_SYSCALL_FUNC(5) +GENERATE_SYSCALL_FUNC(6) + +#endif /* _ASM_SYSCALL_H */ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h new file mode 100644 index 0000000000..35c1d1b860 --- /dev/null +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - s390 specific wrappers to syscall definitions + * + */ + +#ifndef _ASM_S390_SYSCALL_WRAPPER_H +#define _ASM_S390_SYSCALL_WRAPPER_H + +/* Mapping of registers to parameters for syscalls */ +#define SC_S390_REGS_TO_ARGS(x, ...) \ + __MAP(x, __SC_ARGS \ + ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ + ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) + +#ifdef CONFIG_COMPAT + +#define __SC_COMPAT_CAST(t, a) \ +({ \ + long __ReS = a; \ + \ + BUILD_BUG_ON((sizeof(t) > 4) && !__TYPE_IS_L(t) && \ + !__TYPE_IS_UL(t) && !__TYPE_IS_PTR(t) && \ + !__TYPE_IS_LL(t)); \ + if (__TYPE_IS_L(t)) \ + __ReS = (s32)a; \ + if (__TYPE_IS_UL(t)) \ + __ReS = (u32)a; \ + if (__TYPE_IS_PTR(t)) \ + __ReS = a & 0x7fffffff; \ + if (__TYPE_IS_LL(t)) \ + return -ENOSYS; \ + (t)__ReS; \ +}) + +/* + * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias + * named __s390x_sys_*() + */ +#define COMPAT_SYSCALL_DEFINE0(sname) \ + long __s390_compat_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ + long __s390_compat_sys_##sname(void) + +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + long __s390_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ + long __s390x_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ + static inline long __do_sys_##sname(void); \ + long __s390_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) + +#define COND_SYSCALL(name) \ + cond_syscall(__s390x_sys_##name); \ + cond_syscall(__s390_sys_##name) + +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + long __s390_compat_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ + { \ + return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ + } \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) + +/* + * As some compat syscalls may not be implemented, we need to expand + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. + */ +#define COND_SYSCALL_COMPAT(name) \ + cond_syscall(__s390_compat_sys_##name) + +#define __S390_SYS_STUBx(x, name, ...) \ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ + } + +#else /* CONFIG_COMPAT */ + +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + long __s390x_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ + static inline long __do_sys_##sname(void); \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) + +#define COND_SYSCALL(name) \ + cond_syscall(__s390x_sys_##name) + +#define __S390_SYS_STUBx(x, fullname, name, ...) + +#endif /* CONFIG_COMPAT */ + +#define __SYSCALL_DEFINEx(x, name, ...) \ + long __s390x_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__); \ + long __s390x_sys##name(struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \ + } \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) + +#endif /* _ASM_S390_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h new file mode 100644 index 0000000000..ab1c631605 --- /dev/null +++ b/arch/s390/include/asm/sysinfo.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * definition for store system information stsi + * + * Copyright IBM Corp. 2001, 2008 + * + * Author(s): Ulrich Weigand + * Christian Borntraeger + */ + +#ifndef __ASM_S390_SYSINFO_H +#define __ASM_S390_SYSINFO_H + +#include +#include + +struct sysinfo_1_1_1 { + unsigned char p:1; + unsigned char :6; + unsigned char t:1; + unsigned char :8; + unsigned char ccr; + unsigned char cai; + char reserved_0[20]; + unsigned long lic; + char manufacturer[16]; + char type[4]; + char reserved_1[12]; + char model_capacity[16]; + char sequence[16]; + char plant[4]; + char model[16]; + char model_perm_cap[16]; + char model_temp_cap[16]; + unsigned int model_cap_rating; + unsigned int model_perm_cap_rating; + unsigned int model_temp_cap_rating; + unsigned char typepct[5]; + unsigned char reserved_2[3]; + unsigned int ncr; + unsigned int npr; + unsigned int ntr; +}; + +struct sysinfo_1_2_1 { + char reserved_0[80]; + char sequence[16]; + char plant[4]; + char reserved_1[2]; + unsigned short cpu_address; +}; + +struct sysinfo_1_2_2 { + char format; + char reserved_0[1]; + unsigned short acc_offset; + unsigned char mt_installed :1; + unsigned char :2; + unsigned char mt_stid :5; + unsigned char :3; + unsigned char mt_gtid :5; + char reserved_1[18]; + unsigned int nominal_cap; + unsigned int secondary_cap; + unsigned int capability; + unsigned short cpus_total; + unsigned short cpus_configured; + unsigned short cpus_standby; + unsigned short cpus_reserved; + unsigned short adjustment[]; +}; + +struct sysinfo_1_2_2_extension { + unsigned int alt_capability; + unsigned short alt_adjustment[]; +}; + +struct sysinfo_2_2_1 { + char reserved_0[80]; + char sequence[16]; + char plant[4]; + unsigned short cpu_id; + unsigned short cpu_address; +}; + +struct sysinfo_2_2_2 { + char reserved_0[32]; + unsigned short lpar_number; + char reserved_1; + unsigned char characteristics; + unsigned short cpus_total; + unsigned short cpus_configured; + unsigned short cpus_standby; + unsigned short cpus_reserved; + char name[8]; + unsigned int caf; + char reserved_2[8]; + unsigned char mt_installed :1; + unsigned char :2; + unsigned char mt_stid :5; + unsigned char :3; + unsigned char mt_gtid :5; + unsigned char :3; + unsigned char mt_psmtid :5; + char reserved_3[5]; + unsigned short cpus_dedicated; + unsigned short cpus_shared; + char reserved_4[3]; + unsigned char vsne; + uuid_t uuid; + char reserved_5[160]; + char ext_name[256]; +}; + +#define LPAR_CHAR_DEDICATED (1 << 7) +#define LPAR_CHAR_SHARED (1 << 6) +#define LPAR_CHAR_LIMITED (1 << 5) + +struct sysinfo_3_2_2 { + char reserved_0[31]; + unsigned char :4; + unsigned char count:4; + struct { + char reserved_0[4]; + unsigned short cpus_total; + unsigned short cpus_configured; + unsigned short cpus_standby; + unsigned short cpus_reserved; + char name[8]; + unsigned int caf; + char cpi[16]; + char reserved_1[3]; + unsigned char evmne; + unsigned int reserved_2; + uuid_t uuid; + } vm[8]; + char reserved_3[1504]; + char ext_names[8][256]; +}; + +extern int topology_max_mnest; + +/* + * Returns the maximum nesting level supported by the cpu topology code. + * The current maximum level is 4 which is the drawer level. + */ +static inline unsigned char topology_mnest_limit(void) +{ + return min(topology_max_mnest, 4); +} + +#define TOPOLOGY_NR_MAG 6 + +struct topology_core { + unsigned char nl; + unsigned char reserved0[3]; + unsigned char :5; + unsigned char d:1; + unsigned char pp:2; + unsigned char reserved1; + unsigned short origin; + unsigned long mask; +}; + +struct topology_container { + unsigned char nl; + unsigned char reserved[6]; + unsigned char id; +}; + +union topology_entry { + unsigned char nl; + struct topology_core cpu; + struct topology_container container; +}; + +struct sysinfo_15_1_x { + unsigned char reserved0[2]; + unsigned short length; + unsigned char mag[TOPOLOGY_NR_MAG]; + unsigned char reserved1; + unsigned char mnest; + unsigned char reserved2[4]; + union topology_entry tle[]; +}; + +int stsi(void *sysinfo, int fc, int sel1, int sel2); + +/* + * Service level reporting interface. + */ +struct service_level { + struct list_head list; + void (*seq_print)(struct seq_file *, struct service_level *); +}; + +int register_service_level(struct service_level *); +int unregister_service_level(struct service_level *); + +int sthyi_fill(void *dst, u64 *rc); +#endif /* __ASM_S390_SYSINFO_H */ diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h new file mode 100644 index 0000000000..b219056a88 --- /dev/null +++ b/arch/s390/include/asm/text-patching.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_TEXT_PATCHING_H +#define _ASM_S390_TEXT_PATCHING_H + +#include + +static __always_inline void sync_core(void) +{ + bcr_serialize(); +} + +void text_poke_sync(void); +void text_poke_sync_lock(void); + +#endif /* _ASM_S390_TEXT_PATCHING_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h new file mode 100644 index 0000000000..a674c7d25d --- /dev/null +++ b/arch/s390/include/asm/thread_info.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 2002, 2006 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#ifndef _ASM_THREAD_INFO_H +#define _ASM_THREAD_INFO_H + +#include +#ifndef ASM_OFFSETS_C +#include +#endif + +/* + * General size of kernel stacks + */ +#ifdef CONFIG_KASAN +#define THREAD_SIZE_ORDER 4 +#else +#define THREAD_SIZE_ORDER 2 +#endif +#define BOOT_STACK_SIZE (PAGE_SIZE << 2) +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) + +#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE) + +#ifndef __ASSEMBLY__ +#include +#include + +/* + * low level task data that entry.S needs immediate access to + * - this struct should fit entirely inside of one cache line + * - this struct shares the supervisor stack pages + * - if the contents of this structure are changed, the assembly constants must also be changed + */ +struct thread_info { + unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + unsigned int cpu; /* current CPU */ +}; + +/* + * macros/functions for gaining access to the thread information structure + */ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} + +struct task_struct; + +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec + +#endif + +/* + * thread information flags bit numbers + */ +/* _TIF_WORK bits */ +#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ +#define TIF_SIGPENDING 1 /* signal pending */ +#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_UPROBE 3 /* breakpointed or single-stepping */ +#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ +#define TIF_PATCH_PENDING 5 /* pending live patching update */ +#define TIF_PGSTE 6 /* New mm's will use 4K page tables */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ +#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ + +#define TIF_31BIT 16 /* 32bit process */ +#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ +#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ +#define TIF_SINGLE_STEP 19 /* This task is single stepped */ +#define TIF_BLOCK_STEP 20 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ + +/* _TIF_TRACE bits */ +#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ +#define TIF_SECCOMP 26 /* secure computing */ +#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ + +#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) +#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) +#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) +#define _TIF_UPROBE BIT(TIF_UPROBE) +#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) +#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) +#define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) +#define _TIF_PER_TRAP BIT(TIF_PER_TRAP) + +#define _TIF_31BIT BIT(TIF_31BIT) +#define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) + +#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) +#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP BIT(TIF_SECCOMP) +#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) + +#endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h new file mode 100644 index 0000000000..4d646659a5 --- /dev/null +++ b/arch/s390/include/asm/timex.h @@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999 + * + * Derived from "include/asm-i386/timex.h" + * Copyright (C) 1992, Linus Torvalds + */ + +#ifndef _ASM_S390_TIMEX_H +#define _ASM_S390_TIMEX_H + +#include +#include +#include + +/* The value of the TOD clock for 1.1.1970. */ +#define TOD_UNIX_EPOCH 0x7d91048bca000000ULL + +extern u64 clock_comparator_max; + +union tod_clock { + __uint128_t val; + struct { + __uint128_t ei : 8; /* epoch index */ + __uint128_t tod : 64; /* bits 0-63 of tod clock */ + __uint128_t : 40; + __uint128_t pf : 16; /* programmable field */ + }; + struct { + __uint128_t eitod : 72; /* epoch index + bits 0-63 tod clock */ + __uint128_t : 56; + }; + struct { + __uint128_t us : 60; /* micro-seconds */ + __uint128_t sus : 12; /* sub-microseconds */ + __uint128_t : 56; + }; +} __packed; + +/* Inline functions for clock register access. */ +static inline int set_tod_clock(__u64 time) +{ + int cc; + + asm volatile( + " sck %1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) : "Q" (time) : "cc"); + return cc; +} + +static inline int store_tod_clock_ext_cc(union tod_clock *clk) +{ + int cc; + + asm volatile( + " stcke %1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc), "=Q" (*clk) : : "cc"); + return cc; +} + +static __always_inline void store_tod_clock_ext(union tod_clock *tod) +{ + asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); +} + +static inline void set_clock_comparator(__u64 time) +{ + asm volatile("sckc %0" : : "Q" (time)); +} + +static inline void set_tod_programmable_field(u16 val) +{ + asm volatile( + " lgr 0,%[val]\n" + " sckpf\n" + : + : [val] "d" ((unsigned long)val) + : "0"); +} + +void clock_comparator_work(void); + +void __init time_early_init(void); + +extern unsigned char ptff_function_mask[16]; + +/* Function codes for the ptff instruction. */ +#define PTFF_QAF 0x00 /* query available functions */ +#define PTFF_QTO 0x01 /* query tod offset */ +#define PTFF_QSI 0x02 /* query steering information */ +#define PTFF_QUI 0x04 /* query UTC information */ +#define PTFF_ATO 0x40 /* adjust tod offset */ +#define PTFF_STO 0x41 /* set tod offset */ +#define PTFF_SFS 0x42 /* set fine steering rate */ +#define PTFF_SGS 0x43 /* set gross steering rate */ + +/* Query TOD offset result */ +struct ptff_qto { + unsigned long physical_clock; + unsigned long tod_offset; + unsigned long logical_tod_offset; + unsigned long tod_epoch_difference; +} __packed; + +static inline int ptff_query(unsigned int nr) +{ + unsigned char *ptr; + + ptr = ptff_function_mask + (nr >> 3); + return (*ptr & (0x80 >> (nr & 7))) != 0; +} + +/* Query UTC information result */ +struct ptff_qui { + unsigned int tm : 2; + unsigned int ts : 2; + unsigned int : 28; + unsigned int pad_0x04; + unsigned long leap_event; + short old_leap; + short new_leap; + unsigned int pad_0x14; + unsigned long prt[5]; + unsigned long cst[3]; + unsigned int skew; + unsigned int pad_0x5c[41]; +} __packed; + +/* + * ptff - Perform timing facility function + * @ptff_block: Pointer to ptff parameter block + * @len: Length of parameter block + * @func: Function code + * Returns: Condition code (0 on success) + */ +#define ptff(ptff_block, len, func) \ +({ \ + struct addrtype { char _[len]; }; \ + unsigned int reg0 = func; \ + unsigned long reg1 = (unsigned long)(ptff_block); \ + int rc; \ + \ + asm volatile( \ + " lgr 0,%[reg0]\n" \ + " lgr 1,%[reg1]\n" \ + " ptff\n" \ + " ipm %[rc]\n" \ + " srl %[rc],28\n" \ + : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \ + : [reg0] "d" (reg0), [reg1] "d" (reg1) \ + : "cc", "0", "1"); \ + rc; \ +}) + +static inline unsigned long local_tick_disable(void) +{ + unsigned long old; + + old = S390_lowcore.clock_comparator; + S390_lowcore.clock_comparator = clock_comparator_max; + set_clock_comparator(S390_lowcore.clock_comparator); + return old; +} + +static inline void local_tick_enable(unsigned long comp) +{ + S390_lowcore.clock_comparator = comp; + set_clock_comparator(S390_lowcore.clock_comparator); +} + +#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ + +typedef unsigned long cycles_t; + +static __always_inline unsigned long get_tod_clock(void) +{ + union tod_clock clk; + + store_tod_clock_ext(&clk); + return clk.tod; +} + +static inline unsigned long get_tod_clock_fast(void) +{ + unsigned long clk; + + asm volatile("stckf %0" : "=Q" (clk) : : "cc"); + return clk; +} + +static inline cycles_t get_cycles(void) +{ + return (cycles_t) get_tod_clock() >> 2; +} +#define get_cycles get_cycles + +int get_phys_clock(unsigned long *clock); +void init_cpu_timer(void); + +extern union tod_clock tod_clock_base; + +static __always_inline unsigned long __get_tod_clock_monotonic(void) +{ + return get_tod_clock() - tod_clock_base.tod; +} + +/** + * get_clock_monotonic - returns current time in clock rate units + * + * The clock and tod_clock_base get changed via stop_machine. + * Therefore preemption must be disabled, otherwise the returned + * value is not guaranteed to be monotonic. + */ +static inline unsigned long get_tod_clock_monotonic(void) +{ + unsigned long tod; + + preempt_disable_notrace(); + tod = __get_tod_clock_monotonic(); + preempt_enable_notrace(); + return tod; +} + +/** + * tod_to_ns - convert a TOD format value to nanoseconds + * @todval: to be converted TOD format value + * Returns: number of nanoseconds that correspond to the TOD format value + * + * Converting a 64 Bit TOD format value to nanoseconds means that the value + * must be divided by 4.096. In order to achieve that we multiply with 125 + * and divide by 512: + * + * ns = (todval * 125) >> 9; + * + * In order to avoid an overflow with the multiplication we can rewrite this. + * With a split todval == 2^9 * th + tl (th upper 55 bits, tl lower 9 bits) + * we end up with + * + * ns = ((2^9 * th + tl) * 125 ) >> 9; + * -> ns = (th * 125) + ((tl * 125) >> 9); + * + */ +static __always_inline unsigned long tod_to_ns(unsigned long todval) +{ + return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); +} + +/** + * tod_after - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after(unsigned long a, unsigned long b) +{ + if (MACHINE_HAS_SCC) + return (long) a > (long) b; + return a > b; +} + +/** + * tod_after_eq - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after_eq(unsigned long a, unsigned long b) +{ + if (MACHINE_HAS_SCC) + return (long) a >= (long) b; + return a >= b; +} + +#endif diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h new file mode 100644 index 0000000000..383b1f9144 --- /dev/null +++ b/arch/s390/include/asm/tlb.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_TLB_H +#define _S390_TLB_H + +/* + * TLB flushing on s390 is complicated. The following requirement + * from the principles of operation is the most arduous: + * + * "A valid table entry must not be changed while it is attached + * to any CPU and may be used for translation by that CPU except to + * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY, + * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page + * table entry, or (3) make a change by means of a COMPARE AND SWAP + * AND PURGE instruction that purges the TLB." + * + * The modification of a pte of an active mm struct therefore is + * a two step process: i) invalidate the pte, ii) store the new pte. + * This is true for the page protection bit as well. + * The only possible optimization is to flush at the beginning of + * a tlb_gather_mmu cycle if the mm_struct is currently not in use. + * + * Pages used for the page tables is a different story. FIXME: more + */ + +void __tlb_remove_table(void *_table); +static inline void tlb_flush(struct mmu_gather *tlb); +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct encoded_page *page, + int page_size); + +#define tlb_flush tlb_flush +#define pte_free_tlb pte_free_tlb +#define pmd_free_tlb pmd_free_tlb +#define p4d_free_tlb p4d_free_tlb +#define pud_free_tlb pud_free_tlb + +#include +#include + +/* + * Release the page cache reference for a pte removed by + * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page + * has already been freed, so just do free_page_and_swap_cache. + * + * s390 doesn't delay rmap removal, so there is nothing encoded in + * the page pointer. + */ +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct encoded_page *page, + int page_size) +{ + free_page_and_swap_cache(encoded_page_ptr(page)); + return false; +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + __tlb_flush_mm_lazy(tlb->mm); +} + +/* + * pte_free_tlb frees a pte table and clears the CRSTE for the + * page table from the tlb. + */ +static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, + unsigned long address) +{ + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_pmds = 1; + /* + * page_table_free_rcu takes care of the allocation bit masks + * of the 2K table fragments in the 4K page table page, + * then calls tlb_remove_table. + */ + page_table_free_rcu(tlb, (unsigned long *) pte, address); +} + +/* + * pmd_free_tlb frees a pmd table and clears the CRSTE for the + * segment table entry from the tlb. + * If the mm uses a two level page table the single pmd is freed + * as the pgd. pmd_free_tlb checks the asce_limit against 2GB + * to avoid the double free of the pmd in this case. + */ +static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + if (mm_pmd_folded(tlb->mm)) + return; + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; + tlb_remove_ptdesc(tlb, pmd); +} + +/* + * p4d_free_tlb frees a pud table and clears the CRSTE for the + * region second table entry from the tlb. + * If the mm uses a four level page table the single p4d is freed + * as the pgd. p4d_free_tlb checks the asce_limit against 8PB + * to avoid the double free of the p4d in this case. + */ +static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long address) +{ + if (mm_p4d_folded(tlb->mm)) + return; + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb_remove_table(tlb, p4d); +} + +/* + * pud_free_tlb frees a pud table and clears the CRSTE for the + * region third table entry from the tlb. + * If the mm uses a three level page table the single pud is freed + * as the pgd. pud_free_tlb checks the asce_limit against 4TB + * to avoid the double free of the pud in this case. + */ +static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + if (mm_pud_folded(tlb->mm)) + return; + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_p4ds = 1; + tlb_remove_table(tlb, pud); +} + + +#endif /* _S390_TLB_H */ diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h new file mode 100644 index 0000000000..a6e2cd89b6 --- /dev/null +++ b/arch/s390/include/asm/tlbflush.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_TLBFLUSH_H +#define _S390_TLBFLUSH_H + +#include +#include +#include + +/* + * Flush all TLB entries on the local CPU. + */ +static inline void __tlb_flush_local(void) +{ + asm volatile("ptlb" : : : "memory"); +} + +/* + * Flush TLB entries for a specific ASCE on all CPUs + */ +static inline void __tlb_flush_idte(unsigned long asce) +{ + unsigned long opt; + + opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; + /* Global TLB flush for the mm */ + asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc"); +} + +/* + * Flush all TLB entries on all CPUs. + */ +static inline void __tlb_flush_global(void) +{ + unsigned int dummy = 0; + + csp(&dummy, 0, 0); +} + +/* + * Flush TLB entries for a specific mm on all CPUs (in case gmap is used + * this implicates multiple ASCEs!). + */ +static inline void __tlb_flush_mm(struct mm_struct *mm) +{ + unsigned long gmap_asce; + + /* + * If the machine has IDTE we prefer to do a per mm flush + * on all cpus instead of doing a local flush if the mm + * only ran on the local cpu. + */ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + barrier(); + gmap_asce = READ_ONCE(mm->context.gmap_asce); + if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { + if (gmap_asce) + __tlb_flush_idte(gmap_asce); + __tlb_flush_idte(mm->context.asce); + } else { + /* Global TLB flush */ + __tlb_flush_global(); + } + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} + +static inline void __tlb_flush_kernel(void) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_idte(init_mm.context.asce); + else + __tlb_flush_global(); +} + +static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) +{ + spin_lock(&mm->context.lock); + if (mm->context.flush_mm) { + mm->context.flush_mm = 0; + __tlb_flush_mm(mm); + } + spin_unlock(&mm->context.lock); +} + +/* + * TLB flushing: + * flush_tlb() - flushes the current mm struct TLBs + * flush_tlb_all() - flushes all processes TLBs + * flush_tlb_mm(mm) - flushes the specified mm context TLB's + * flush_tlb_page(vma, vmaddr) - flushes one page + * flush_tlb_range(vma, start, end) - flushes a range of pages + * flush_tlb_kernel_range(start, end) - flushes a range of kernel pages + */ + +/* + * flush_tlb_mm goes together with ptep_set_wrprotect for the + * copy_page_range operation and flush_tlb_range is related to + * ptep_get_and_clear for change_protection. ptep_set_wrprotect and + * ptep_get_and_clear do not flush the TLBs directly if the mm has + * only one user. At the end of the update the flush_tlb_mm and + * flush_tlb_range functions need to do the flush. + */ +#define flush_tlb() do { } while (0) +#define flush_tlb_all() do { } while (0) +#define flush_tlb_page(vma, addr) do { } while (0) + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + __tlb_flush_mm_lazy(mm); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + __tlb_flush_mm_lazy(vma->vm_mm); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + __tlb_flush_kernel(); +} + +#endif /* _S390_TLBFLUSH_H */ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h new file mode 100644 index 0000000000..3a0ac0c7a9 --- /dev/null +++ b/arch/s390/include/asm/topology.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_TOPOLOGY_H +#define _ASM_S390_TOPOLOGY_H + +#include +#include + +struct sysinfo_15_1_x; +struct cpu; + +#ifdef CONFIG_SCHED_TOPOLOGY + +struct cpu_topology_s390 { + unsigned short thread_id; + unsigned short core_id; + unsigned short socket_id; + unsigned short book_id; + unsigned short drawer_id; + unsigned short dedicated : 1; + int booted_cores; + cpumask_t thread_mask; + cpumask_t core_mask; + cpumask_t book_mask; + cpumask_t drawer_mask; +}; + +extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; + +#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) +#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_mask) +#define topology_core_id(cpu) (cpu_topology[cpu].core_id) +#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_mask) +#define topology_book_id(cpu) (cpu_topology[cpu].book_id) +#define topology_book_cpumask(cpu) (&cpu_topology[cpu].book_mask) +#define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) +#define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) +#define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated) +#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores) + +#define mc_capable() 1 + +void topology_init_early(void); +int topology_cpu_init(struct cpu *); +int topology_set_cpu_management(int fc); +void topology_schedule_update(void); +void store_topology(struct sysinfo_15_1_x *info); +void update_cpu_masks(void); +void topology_expect_change(void); +const struct cpumask *cpu_coregroup_mask(int cpu); + +#else /* CONFIG_SCHED_TOPOLOGY */ + +static inline void topology_init_early(void) { } +static inline void topology_schedule_update(void) { } +static inline int topology_cpu_init(struct cpu *cpu) { return 0; } +static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline int topology_booted_cores(int cpu_nr) { return 1; } +static inline void update_cpu_masks(void) { } +static inline void topology_expect_change(void) { } + +#endif /* CONFIG_SCHED_TOPOLOGY */ + +#define POLARIZATION_UNKNOWN (-1) +#define POLARIZATION_HRZ (0) +#define POLARIZATION_VL (1) +#define POLARIZATION_VM (2) +#define POLARIZATION_VH (3) + +#define SD_BOOK_INIT SD_CPU_INIT + +#ifdef CONFIG_NUMA + +#define cpu_to_node cpu_to_node +static inline int cpu_to_node(int cpu) +{ + return 0; +} + +/* Returns a pointer to the cpumask of CPUs on node 'node'. */ +#define cpumask_of_node cpumask_of_node +static inline const struct cpumask *cpumask_of_node(int node) +{ + return cpu_possible_mask; +} + +#define pcibus_to_node(bus) __pcibus_to_node(bus) + +#else /* !CONFIG_NUMA */ + +#define numa_node_id numa_node_id +static inline int numa_node_id(void) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ + +#include + +#endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h new file mode 100644 index 0000000000..f76e5fdff2 --- /dev/null +++ b/arch/s390/include/asm/tpi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_S390_TPI_H +#define _ASM_S390_TPI_H + +#include +#include + +#ifndef __ASSEMBLY__ + +/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). */ +struct tpi_info { + struct subchannel_id schid; + u32 intparm; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :12; + u32 type:3; + u32 :12; +} __packed __aligned(4); + +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/trace/diag.h b/arch/s390/include/asm/trace/diag.h new file mode 100644 index 0000000000..22fcac4ff9 --- /dev/null +++ b/arch/s390/include/asm/trace/diag.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Tracepoint header for s390 diagnose calls + * + * Copyright IBM Corp. 2015 + * Author(s): Martin Schwidefsky + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM s390 + +#if !defined(_TRACE_S390_DIAG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_S390_DIAG_H + +#include + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE + +#define TRACE_INCLUDE_PATH asm/trace +#define TRACE_INCLUDE_FILE diag + +TRACE_EVENT(s390_diagnose, + TP_PROTO(unsigned short nr), + TP_ARGS(nr), + TP_STRUCT__entry( + __field(unsigned short, nr) + ), + TP_fast_assign( + __entry->nr = nr; + ), + TP_printk("nr=0x%x", __entry->nr) +); + +#ifdef CONFIG_TRACEPOINTS +void trace_s390_diagnose_norecursion(int diag_nr); +#else +static inline void trace_s390_diagnose_norecursion(int diag_nr) { } +#endif + +#endif /* _TRACE_S390_DIAG_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/s390/include/asm/trace/zcrypt.h b/arch/s390/include/asm/trace/zcrypt.h new file mode 100644 index 0000000000..457ddaa99e --- /dev/null +++ b/arch/s390/include/asm/trace/zcrypt.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Tracepoint definitions for the s390 zcrypt device driver + * + * Copyright IBM Corp. 2016 + * Author(s): Harald Freudenberger + * + * Currently there are two tracepoint events defined here. + * An s390_zcrypt_req request event occurs as soon as the request is + * recognized by the zcrypt ioctl function. This event may act as some kind + * of request-processing-starts-now indication. + * As late as possible within the zcrypt ioctl function there occurs the + * s390_zcrypt_rep event which may act as the point in time where the + * request has been processed by the kernel and the result is about to be + * transferred back to userspace. + * The glue which binds together request and reply event is the ptr + * parameter, which is the local buffer address where the request from + * userspace has been stored by the ioctl function. + * + * The main purpose of this zcrypt tracepoint api is to get some data for + * performance measurements together with information about on which card + * and queue the request has been processed. It is not an ffdc interface as + * there is already code in the zcrypt device driver to serve the s390 + * debug feature interface. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM s390 + +#if !defined(_TRACE_S390_ZCRYPT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_S390_ZCRYPT_H + +#include + +#define TP_ICARSAMODEXPO 0x0001 +#define TP_ICARSACRT 0x0002 +#define TB_ZSECSENDCPRB 0x0003 +#define TP_ZSENDEP11CPRB 0x0004 +#define TP_HWRNGCPRB 0x0005 + +#define show_zcrypt_tp_type(type) \ + __print_symbolic(type, \ + { TP_ICARSAMODEXPO, "ICARSAMODEXPO" }, \ + { TP_ICARSACRT, "ICARSACRT" }, \ + { TB_ZSECSENDCPRB, "ZSECSENDCPRB" }, \ + { TP_ZSENDEP11CPRB, "ZSENDEP11CPRB" }, \ + { TP_HWRNGCPRB, "HWRNGCPRB" }) + +/** + * trace_s390_zcrypt_req - zcrypt request tracepoint function + * @ptr: Address of the local buffer where the request from userspace + * is stored. Can be used as a unique id to relate together + * request and reply. + * @type: One of the TP_ defines above. + * + * Called when a request from userspace is recognised within the ioctl + * function of the zcrypt device driver and may act as an entry + * timestamp. + */ +TRACE_EVENT(s390_zcrypt_req, + TP_PROTO(void *ptr, u32 type), + TP_ARGS(ptr, type), + TP_STRUCT__entry( + __field(void *, ptr) + __field(u32, type)), + TP_fast_assign( + __entry->ptr = ptr; + __entry->type = type;), + TP_printk("ptr=%p type=%s", + __entry->ptr, + show_zcrypt_tp_type(__entry->type)) +); + +/** + * trace_s390_zcrypt_rep - zcrypt reply tracepoint function + * @ptr: Address of the local buffer where the request from userspace + * is stored. Can be used as a unique id to match together + * request and reply. + * @fc: Function code. + * @rc: The bare returncode as returned by the device driver ioctl + * function. + * @dev: The adapter nr where this request was actually processed. + * @dom: Domain id of the device where this request was processed. + * + * Called upon recognising the reply from the crypto adapter. This + * message may act as the exit timestamp for the request but also + * carries some info about on which adapter the request was processed + * and the returncode from the device driver. + */ +TRACE_EVENT(s390_zcrypt_rep, + TP_PROTO(void *ptr, u32 fc, u32 rc, u16 dev, u16 dom), + TP_ARGS(ptr, fc, rc, dev, dom), + TP_STRUCT__entry( + __field(void *, ptr) + __field(u32, fc) + __field(u32, rc) + __field(u16, device) + __field(u16, domain)), + TP_fast_assign( + __entry->ptr = ptr; + __entry->fc = fc; + __entry->rc = rc; + __entry->device = dev; + __entry->domain = dom;), + TP_printk("ptr=%p fc=0x%04x rc=%d dev=0x%02hx domain=0x%04hx", + __entry->ptr, + (unsigned int) __entry->fc, + (int) __entry->rc, + (unsigned short) __entry->device, + (unsigned short) __entry->domain) +); + +#endif /* _TRACE_S390_ZCRYPT_H */ + +/* This part must be outside protection */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE + +#define TRACE_INCLUDE_PATH asm/trace +#define TRACE_INCLUDE_FILE zcrypt + +#include diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h new file mode 100644 index 0000000000..0b5d550a04 --- /dev/null +++ b/arch/s390/include/asm/types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ASM_S390_TYPES_H +#define _ASM_S390_TYPES_H + +#include + +#ifndef __ASSEMBLY__ + +union register_pair { + unsigned __int128 pair; + struct { + unsigned long even; + unsigned long odd; + }; +}; + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_TYPES_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h new file mode 100644 index 0000000000..8a8c64a678 --- /dev/null +++ b/arch/s390/include/asm/uaccess.h @@ -0,0 +1,601 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "include/asm-i386/uaccess.h" + */ +#ifndef __S390_UACCESS_H +#define __S390_UACCESS_H + +/* + * User space memory access functions + */ +#include +#include +#include +#include +#include +#include + +void debug_user_asce(int exit); + +unsigned long __must_check +raw_copy_from_user(void *to, const void __user *from, unsigned long n); + +unsigned long __must_check +raw_copy_to_user(void __user *to, const void *from, unsigned long n); + +#ifndef CONFIG_KASAN +#define INLINE_COPY_FROM_USER +#define INLINE_COPY_TO_USER +#endif + +unsigned long __must_check +_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(to, n, false)) + n = _copy_from_user_key(to, from, n, key); + return n; +} + +unsigned long __must_check +_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key); + +static __always_inline unsigned long __must_check +copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key) +{ + if (check_copy_size(from, n, true)) + n = _copy_to_user_key(to, from, n, key); + return n; +} + +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +int __noreturn __put_user_bad(void); + +#define __put_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val) \ + : "cc", "0"); \ + __rc; \ +}) + +static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) +{ + int rc; + + switch (size) { + case 1: + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); + break; + case 2: + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); + break; + case 4: + rc = __put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); + break; + case 8: + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); + break; + default: + __put_user_bad(); + break; + } + return rc; +} + +int __noreturn __get_user_bad(void); + +#define __get_user_asm(to, from, size) \ +({ \ + union oac __oac_spec = { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1, \ + }; \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \ + EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \ + : [rc] "=&d" (__rc), "=Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [spec] "d" (__oac_spec.val), [_to] "a" (to), \ + [_ksize] "K" (size) \ + : "cc", "0"); \ + __rc; \ +}) + +static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) +{ + int rc; + + switch (size) { + case 1: + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); + break; + case 2: + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); + break; + case 4: + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); + break; + case 8: + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); + break; + default: + __get_user_bad(); + break; + } + return rc; +} + +/* + * These are the main single-value transfer routines. They automatically + * use the right size if we just have the right pointer type. + */ +#define __put_user(x, ptr) \ +({ \ + __typeof__(*(ptr)) __x = (x); \ + int __pu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: \ + case 2: \ + case 4: \ + case 8: \ + __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \ + break; \ + default: \ + __put_user_bad(); \ + break; \ + } \ + __builtin_expect(__pu_err, 0); \ +}) + +#define put_user(x, ptr) \ +({ \ + might_fault(); \ + __put_user(x, ptr); \ +}) + +#define __get_user(x, ptr) \ +({ \ + int __gu_err = -EFAULT; \ + \ + __chk_user_ptr(ptr); \ + switch (sizeof(*(ptr))) { \ + case 1: { \ + unsigned char __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \ + (x) = *(__force __typeof__(*(ptr)) *)&__x; \ + break; \ + }; \ + default: \ + __get_user_bad(); \ + break; \ + } \ + __builtin_expect(__gu_err, 0); \ +}) + +#define get_user(x, ptr) \ +({ \ + might_fault(); \ + __get_user(x, ptr); \ +}) + +/* + * Copy a null terminated string from userspace. + */ +long __must_check strncpy_from_user(char *dst, const char __user *src, long count); + +long __must_check strnlen_user(const char __user *src, long count); + +/* + * Zero Userspace + */ +unsigned long __must_check __clear_user(void __user *to, unsigned long size); + +static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) +{ + might_fault(); + return __clear_user(to, n); +} + +void *s390_kernel_write(void *dst, const void *src, size_t size); + +int __noreturn __put_kernel_bad(void); + +#define __put_kernel_asm(val, to, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_to]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_STORE(0b, 2b, %[rc]) \ + EX_TABLE_UA_STORE(1b, 2b, %[rc]) \ + : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \ + : [_val] "d" (val) \ + : "cc"); \ + __rc; \ +}) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + unsigned long __x = (unsigned long)(*((type *)(src))); \ + int __pk_err; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \ + break; \ + case 2: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \ + break; \ + case 4: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \ + break; \ + case 8: \ + __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \ + break; \ + default: \ + __pk_err = __put_kernel_bad(); \ + break; \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) + +int __noreturn __get_kernel_bad(void); + +#define __get_kernel_asm(val, from, insn) \ +({ \ + int __rc; \ + \ + asm volatile( \ + "0: " insn " %[_val],%[_from]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \ + EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \ + : [rc] "=d" (__rc), [_val] "=d" (val) \ + : [_from] "Q" (*(from)) \ + : "cc"); \ + __rc; \ +}) + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gk_err; \ + \ + switch (sizeof(type)) { \ + case 1: { \ + unsigned char __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 2: { \ + unsigned short __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 4: { \ + unsigned int __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + case 8: { \ + unsigned long __x; \ + \ + __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \ + *((type *)(dst)) = (type)__x; \ + break; \ + }; \ + default: \ + __gk_err = __get_kernel_bad(); \ + break; \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +void __cmpxchg_user_key_called_with_bad_pointer(void); + +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, + __uint128_t old, __uint128_t new, + unsigned long key, int size) +{ + int rc = 0; + + switch (size) { + case 1: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = ((unsigned int)old & 0xff) << shift; + _new = ((unsigned int)new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned char *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 2: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = ((unsigned int)old & 0xffff) << shift; + _new = ((unsigned int)new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned short *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cs %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" ((unsigned int)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned int *)uval = prev; + return rc; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: csg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" ((unsigned long)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned long *)uval = prev; + return rc; + } + case 16: { + __uint128_t prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(__uint128_t *)uval = prev; + return rc; + } + } + __cmpxchg_user_key_called_with_bad_pointer(); + return rc; +} + +/** + * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys + * @ptr: User space address of value to compare to @old and exchange with + * @new. Must be aligned to sizeof(*@ptr). + * @uval: Address where the old value of *@ptr is written to. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written to *@uval. + * @new: New value to place at *@ptr. + * @key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on a user space target, honoring storage key protection. + * @key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * The caller must compare *@uval and @old to determine if values have been + * exchanged. In case of an exception *@uval is set to zero. + * + * Return: 0: cmpxchg executed + * -EFAULT: an exception happened when trying to access *@ptr + * -EAGAIN: maxed out number of retries (byte and short only) + */ +#define cmpxchg_user_key(ptr, uval, old, new, key) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(uval) __uval = (uval); \ + \ + BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ + might_fault(); \ + __chk_user_ptr(__ptr); \ + __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ + (old), (new), (key), sizeof(*(__ptr))); \ +}) + +#endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h new file mode 100644 index 0000000000..4260bc5ce7 --- /dev/null +++ b/arch/s390/include/asm/unistd.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * + * Derived from "include/asm-i386/unistd.h" + */ +#ifndef _ASM_S390_UNISTD_H_ +#define _ASM_S390_UNISTD_H_ + +#include +#include + +#define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_OLD_READDIR +#define __ARCH_WANT_SYS_ALARM +#define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_PAUSE +#define __ARCH_WANT_SYS_SIGNAL +#define __ARCH_WANT_SYS_UTIME +#define __ARCH_WANT_SYS_SOCKETCALL +#define __ARCH_WANT_SYS_IPC +#define __ARCH_WANT_SYS_FADVISE64 +#define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_NICE +#define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_MMAP +#define __ARCH_WANT_SYS_OLDUMOUNT +#define __ARCH_WANT_SYS_SIGPENDING +#define __ARCH_WANT_SYS_SIGPROCMASK +# ifdef CONFIG_COMPAT +# define __ARCH_WANT_COMPAT_STAT +# define __ARCH_WANT_SYS_TIME32 +# define __ARCH_WANT_SYS_UTIME32 +# endif +#define __ARCH_WANT_SYS_FORK +#define __ARCH_WANT_SYS_VFORK +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#endif /* _ASM_S390_UNISTD_H_ */ diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h new file mode 100644 index 0000000000..b8ecf04e34 --- /dev/null +++ b/arch/s390/include/asm/unwind.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UNWIND_H +#define _ASM_S390_UNWIND_H + +#include +#include +#include +#include +#include +#include + +/* + * To use the stack unwinder it has to be initialized with unwind_start. + * There four combinations for task and regs: + * 1) task==NULL, regs==NULL: the unwind starts for the task that is currently + * running, sp/ip picked up from the CPU registers + * 2) task==NULL, regs!=NULL: the unwind starts from the sp/ip found in + * the struct pt_regs of an interrupt frame for the current task + * 3) task!=NULL, regs==NULL: the unwind starts for an inactive task with + * the sp picked up from task->thread.ksp and the ip picked up from the + * return address stored by __switch_to + * 4) task!=NULL, regs!=NULL: the sp/ip are picked up from the interrupt + * frame 'regs' of a inactive task + * If 'first_frame' is not zero unwind_start skips unwind frames until it + * reaches the specified stack pointer. + * The end of the unwinding is indicated with unwind_done, this can be true + * right after unwind_start, e.g. with first_frame!=0 that can not be found. + * unwind_next_frame skips to the next frame. + * Once the unwind is completed unwind_error() can be used to check if there + * has been a situation where the unwinder could not correctly understand + * the tasks call chain. + */ + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + struct pt_regs *regs; + unsigned long sp, ip; + int graph_idx; + struct llist_node *kr_cur; + bool reliable; + bool error; +}; + +/* Recover the return address modified by rethook and ftrace_graph. */ +static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long ip) +{ + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(ip)) + ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur); +#endif + return ip; +} + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long first_frame); +bool unwind_next_frame(struct unwind_state *state); +unsigned long unwind_get_return_address(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + +static __always_inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long first_frame) +{ + task = task ?: current; + first_frame = first_frame ?: get_stack_pointer(task, regs); + __unwind_start(state, task, regs, first_frame); +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return unwind_done(state) ? NULL : state->regs; +} + +#define unwind_for_each_frame(state, task, regs, first_frame) \ + for (unwind_start(state, task, regs, first_frame); \ + !unwind_done(state); \ + unwind_next_frame(state)) + +static inline void unwind_init(void) {} +static inline void unwind_module_init(struct module *mod, void *orc_ip, + size_t orc_ip_size, void *orc, + size_t orc_size) {} + +#endif /* _ASM_S390_UNWIND_H */ diff --git a/arch/s390/include/asm/uprobes.h b/arch/s390/include/asm/uprobes.h new file mode 100644 index 0000000000..b60b3c7ef6 --- /dev/null +++ b/arch/s390/include/asm/uprobes.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * User-space Probes (UProbes) for s390 + * + * Copyright IBM Corp. 2014 + * Author(s): Jan Willeke, + */ + +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H + +#include + +typedef u16 uprobe_opcode_t; + +#define UPROBE_XOL_SLOT_BYTES 256 /* cache aligned */ + +#define UPROBE_SWBP_INSN 0x0002 +#define UPROBE_SWBP_INSN_SIZE 2 + +struct arch_uprobe { + union{ + uprobe_opcode_t insn[3]; + uprobe_opcode_t ixol[3]; + }; + unsigned int saved_per : 1; + unsigned int saved_int_code; +}; + +struct arch_uprobe_task { +}; + +#endif /* _ASM_UPROBES_H */ diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h new file mode 100644 index 0000000000..8e8aaf4858 --- /dev/null +++ b/arch/s390/include/asm/user.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 version + * + * Derived from "include/asm-i386/usr.h" + */ + +#ifndef _S390_USER_H +#define _S390_USER_H + +#include +#include +/* Core file format: The core file is written in such a way that gdb + can understand it and provide useful information to the user (under + linux we use the 'trad-core' bfd). There are quite a number of + obstacles to being able to view the contents of the floating point + registers, and until these are solved you will not be able to view the + contents of them. Actually, you can read in the core file and look at + the contents of the user struct to find out what the floating point + registers contain. + The actual file contents are as follows: + UPAGE: 1 page consisting of a user struct that tells gdb what is present + in the file. Directly after this is a copy of the task_struct, which + is currently not used by gdb, but it may come in useful at some point. + All of the registers are stored as part of the upage. The upage should + always be only one page. + DATA: The data area is stored. We use current->end_text to + current->brk to pick up all of the user variables, plus any memory + that may have been malloced. No attempt is made to determine if a page + is demand-zero or if a page is totally unused, we just cover the entire + range. All of the addresses are rounded in such a way that an integral + number of pages is written. + STACK: We need the stack information in order to get a meaningful + backtrace. We need to write the data from (esp) to + current->start_stack, so we round each of these off in order to be able + to write an integer number of pages. + The minimum core file size is 3 pages, or 12288 bytes. +*/ + + +/* + * This is the old layout of "struct pt_regs", and + * is still the layout used by user mode (the new + * pt_regs doesn't have all registers as the kernel + * doesn't use the extra segment registers) + */ + +/* When the kernel dumps core, it starts by dumping the user struct - + this will be used by gdb to figure out where the data and stack segments + are within the file, and what virtual addresses to use. */ +struct user { +/* We start with the registers, to mimic the way that "memory" is returned + from the ptrace(3,...) function. */ + struct user_regs_struct regs; /* Where the registers are actually stored */ +/* The rest of this junk is to help gdb figure out what goes where */ + unsigned long int u_tsize; /* Text segment size (pages). */ + unsigned long int u_dsize; /* Data segment size (pages). */ + unsigned long int u_ssize; /* Stack segment size (pages). */ + unsigned long start_code; /* Starting virtual address of text. */ + unsigned long start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + long int signal; /* Signal that caused the core dump. */ + unsigned long u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + unsigned long magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ +}; + +#endif /* _S390_USER_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h new file mode 100644 index 0000000000..0e7bd38739 --- /dev/null +++ b/arch/s390/include/asm/uv.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor Interfaces + * + * Copyright IBM Corp. 2019, 2022 + * + * Author(s): + * Vasily Gorbik + * Janosch Frank + */ +#ifndef _ASM_S390_UV_H +#define _ASM_S390_UV_H + +#include +#include +#include +#include +#include +#include + +#define UVC_CC_OK 0 +#define UVC_CC_ERROR 1 +#define UVC_CC_BUSY 2 +#define UVC_CC_PARTIAL 3 + +#define UVC_RC_EXECUTED 0x0001 +#define UVC_RC_INV_CMD 0x0002 +#define UVC_RC_INV_STATE 0x0003 +#define UVC_RC_INV_LEN 0x0005 +#define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 + +#define UVC_CMD_QUI 0x0001 +#define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 +#define UVC_CMD_CONV_TO_SEC_STOR 0x0200 +#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_DESTR_SEC_STOR 0x0202 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_CPU_RESET 0x0310 +#define UVC_CMD_CPU_RESET_INITIAL 0x0311 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_CPU_RESET_CLEAR 0x0321 +#define UVC_CMD_CPU_SET_STATE 0x0330 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 +#define UVC_CMD_PIN_PAGE_SHARED 0x0341 +#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 +#define UVC_CMD_DUMP_INIT 0x0400 +#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401 +#define UVC_CMD_DUMP_CPU 0x0402 +#define UVC_CMD_DUMP_COMPLETE 0x0403 +#define UVC_CMD_SET_SHARED_ACCESS 0x1000 +#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 +#define UVC_CMD_RETR_ATTEST 0x1020 +#define UVC_CMD_ADD_SECRET 0x1031 +#define UVC_CMD_LIST_SECRETS 0x1033 +#define UVC_CMD_LOCK_SECRETS 0x1034 + +/* Bits in installed uv calls */ +enum uv_cmds_inst { + BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, + BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, + BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, + BIT_UVC_CMD_SET_SHARED_ACCESS = 8, + BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_CPU_RESET = 15, + BIT_UVC_CMD_CPU_RESET_INITIAL = 16, + BIT_UVC_CMD_CPU_SET_STATE = 17, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19, + BIT_UVC_CMD_UNSHARE_ALL = 20, + BIT_UVC_CMD_PIN_PAGE_SHARED = 21, + BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23, + BIT_UVC_CMD_DUMP_INIT = 24, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, + BIT_UVC_CMD_DUMP_CPU = 26, + BIT_UVC_CMD_DUMP_COMPLETE = 27, + BIT_UVC_CMD_RETR_ATTEST = 28, + BIT_UVC_CMD_ADD_SECRET = 29, + BIT_UVC_CMD_LIST_SECRETS = 30, + BIT_UVC_CMD_LOCK_SECRETS = 31, +}; + +enum uv_feat_ind { + BIT_UV_FEAT_MISC = 0, + BIT_UV_FEAT_AIV = 1, + BIT_UV_FEAT_AP = 4, + BIT_UV_FEAT_AP_INTR = 5, +}; + +struct uv_cb_header { + u16 len; + u16 cmd; /* Command Code */ + u16 rc; /* Response Code */ + u16 rrc; /* Return Reason Code */ +} __packed __aligned(8); + +/* Query Ultravisor Information */ +struct uv_cb_qui { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08; /* 0x0008 */ + u64 inst_calls_list[4]; /* 0x0010 */ + u64 reserved30[2]; /* 0x0030 */ + u64 uv_base_stor_len; /* 0x0040 */ + u64 reserved48; /* 0x0048 */ + u64 conf_base_phys_stor_len; /* 0x0050 */ + u64 conf_base_virt_stor_len; /* 0x0058 */ + u64 conf_virt_var_stor_len; /* 0x0060 */ + u64 cpu_stor_len; /* 0x0068 */ + u32 reserved70[3]; /* 0x0070 */ + u32 max_num_sec_conf; /* 0x007c */ + u64 max_guest_stor_addr; /* 0x0080 */ + u8 reserved88[0x9e - 0x88]; /* 0x0088 */ + u16 max_guest_cpu_id; /* 0x009e */ + u64 uv_feature_indications; /* 0x00a0 */ + u64 reserveda8; /* 0x00a8 */ + u64 supp_se_hdr_versions; /* 0x00b0 */ + u64 supp_se_hdr_pcf; /* 0x00b8 */ + u64 reservedc0; /* 0x00c0 */ + u64 conf_dump_storage_state_len; /* 0x00c8 */ + u64 conf_dump_finalize_len; /* 0x00d0 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u64 reservedf0; /* 0x00f0 */ + u64 supp_add_secret_req_ver; /* 0x00f8 */ + u64 supp_add_secret_pcf; /* 0x0100 */ + u64 supp_secret_types; /* 0x0180 */ + u16 max_secrets; /* 0x0110 */ + u8 reserved112[0x120 - 0x112]; /* 0x0112 */ +} __packed __aligned(8); + +/* Initialize Ultravisor */ +struct uv_cb_init { + struct uv_cb_header header; + u64 reserved08[2]; + u64 stor_origin; + u64 stor_len; + u64 reserved28[4]; +} __packed __aligned(8); + +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u8 reserved30[6]; + union { + struct { + u16 : 14; + u16 ap_instr_intr : 1; + u16 ap_allow_instr : 1; + }; + u16 raw; + } flags; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ +struct uv_cb_cts { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; +} __packed __aligned(8); + +/* Convert from Secure / Pin Page Shared */ +struct uv_cb_cfs { + struct uv_cb_header header; + u64 reserved08[2]; + u64 paddr; +} __packed __aligned(8); + +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + +#define PV_CPU_STATE_OPR 1 +#define PV_CPU_STATE_STP 2 +#define PV_CPU_STATE_CHKSTP 3 +#define PV_CPU_STATE_OPR_LOAD 5 + +struct uv_cb_cpu_set_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u8 reserved20[7]; + u8 state; + u64 reserved28[5]; +}; + +/* + * A common UV call struct for calls that take no payload + * Examples: + * Destroy cpu/config + * Verify + */ +struct uv_cb_nodata { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[4]; +} __packed __aligned(8); + +/* Destroy Configuration Fast */ +struct uv_cb_destroy_fast { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[5]; +} __packed __aligned(8); + +/* Set Shared Access */ +struct uv_cb_share { + struct uv_cb_header header; + u64 reserved08[3]; + u64 paddr; + u64 reserved28; +} __packed __aligned(8); + +/* Retrieve Attestation Measurement */ +struct uv_cb_attest { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08[2]; /* 0x0008 */ + u64 arcb_addr; /* 0x0018 */ + u64 cont_token; /* 0x0020 */ + u8 reserved28[6]; /* 0x0028 */ + u16 user_data_len; /* 0x002e */ + u8 user_data[256]; /* 0x0030 */ + u32 reserved130[3]; /* 0x0130 */ + u32 meas_len; /* 0x013c */ + u64 meas_addr; /* 0x0140 */ + u8 config_uid[16]; /* 0x0148 */ + u32 reserved158; /* 0x0158 */ + u32 add_data_len; /* 0x015c */ + u64 add_data_addr; /* 0x0160 */ + u64 reserved168[4]; /* 0x0168 */ +} __packed __aligned(8); + +struct uv_cb_dump_cpu { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 dump_area_origin; + u64 reserved28[5]; +} __packed __aligned(8); + +struct uv_cb_dump_stor_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 gaddr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_dump_complete { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 reserved30[5]; +} __packed __aligned(8); + +/* + * A common UV call struct for pv guests that contains a single address + * Examples: + * Add Secret + * List Secrets + */ +struct uv_cb_guest_addr { + struct uv_cb_header header; + u64 reserved08[3]; + u64 addr; + u64 reserved28[4]; +} __packed __aligned(8); + +static inline int __uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + asm volatile( + " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc) + : [r1] "a" (r1), [r2] "a" (r2) + : "memory", "cc"); + return cc; +} + +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + } while (cc > 1); + return cc; +} + +/* Low level uv_call that avoids stalls for long running busy conditions */ +static inline int uv_call_sched(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + cond_resched(); + } while (cc > 1); + return cc; +} + +/* + * special variant of uv_call that only transports the cpu or guest + * handle and the command, like destroy or verify. + */ +static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) +{ + struct uv_cb_nodata uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .handle = handle, + }; + int cc; + + WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd); + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc ? -EINVAL : 0; +} + +struct uv_info { + unsigned long inst_calls_list[4]; + unsigned long uv_base_stor_len; + unsigned long guest_base_stor_len; + unsigned long guest_virt_base_stor_len; + unsigned long guest_virt_var_stor_len; + unsigned long guest_cpu_stor_len; + unsigned long max_sec_stor_addr; + unsigned int max_num_sec_conf; + unsigned short max_guest_cpu_id; + unsigned long uv_feature_indications; + unsigned long supp_se_hdr_ver; + unsigned long supp_se_hdr_pcf; + unsigned long conf_dump_storage_state_len; + unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; + unsigned long supp_add_secret_req_ver; + unsigned long supp_add_secret_pcf; + unsigned long supp_secret_types; + unsigned short max_secrets; +}; + +extern struct uv_info uv_info; + +static inline bool uv_has_feature(u8 feature_bit) +{ + if (feature_bit >= sizeof(uv_info.uv_feature_indications) * 8) + return false; + return test_bit_inv(feature_bit, &uv_info.uv_feature_indications); +} + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +extern int prot_virt_guest; + +static inline int is_prot_virt_guest(void) +{ + return prot_virt_guest; +} + +static inline int share(unsigned long addr, u16 cmd) +{ + struct uv_cb_share uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .paddr = addr + }; + + if (!is_prot_virt_guest()) + return -EOPNOTSUPP; + /* + * Sharing is page wise, if we encounter addresses that are + * not page aligned, we assume something went wrong. If + * malloced structs are passed to this function, we could leak + * data to the hypervisor. + */ + BUG_ON(addr & ~PAGE_MASK); + + if (!uv_call(0, (u64)&uvcb)) + return 0; + return -EINVAL; +} + +/* + * Guest 2 request to the Ultravisor to make a page shared with the + * hypervisor for IO. + * + * @addr: Real or absolute address of the page to be shared + */ +static inline int uv_set_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_SET_SHARED_ACCESS); +} + +/* + * Guest 2 request to the Ultravisor to make a page unshared. + * + * @addr: Real or absolute address of the page to be unshared + */ +static inline int uv_remove_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); +} + +#else +#define is_prot_virt_guest() 0 +static inline int uv_set_shared(unsigned long addr) { return 0; } +static inline int uv_remove_shared(unsigned long addr) { return 0; } +#endif + +#if IS_ENABLED(CONFIG_KVM) +extern int prot_virt_host; + +static inline int is_prot_virt_host(void) +{ + return prot_virt_host; +} + +int uv_pin_shared(unsigned long paddr); +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); +int uv_destroy_owned_page(unsigned long paddr); +int uv_convert_from_secure(unsigned long paddr); +int uv_convert_owned_from_secure(unsigned long paddr); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); + +void setup_uv(void); +#else +#define is_prot_virt_host() 0 +static inline void setup_uv(void) {} + +static inline int uv_pin_shared(unsigned long paddr) +{ + return 0; +} + +static inline int uv_destroy_owned_page(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_from_secure(unsigned long paddr) +{ + return 0; +} + +static inline int uv_convert_owned_from_secure(unsigned long paddr) +{ + return 0; +} +#endif + +#endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h new file mode 100644 index 0000000000..53165aa781 --- /dev/null +++ b/arch/s390/include/asm/vdso.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_VDSO_H__ +#define __S390_VDSO_H__ + +#include + +#ifndef __ASSEMBLY__ + +#include +#ifdef CONFIG_COMPAT +#include +#endif + +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) +#ifdef CONFIG_COMPAT +#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) (-1UL) +#endif + +extern struct vdso_data *vdso_data; + +int vdso_getcpu_init(void); + +#endif /* __ASSEMBLY__ */ + +/* Default link address for the vDSO */ +#define VDSO_LBASE 0 + +#define __VVAR_PAGES 2 + +#define VDSO_VERSION_STRING LINUX_2.6.29 + +#endif /* __S390_VDSO_H__ */ diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h new file mode 100644 index 0000000000..a93eda0ce7 --- /dev/null +++ b/arch/s390/include/asm/vdso/clocksource.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_CLOCKSOURCE_H +#define __ASM_VDSO_CLOCKSOURCE_H + +#define VDSO_ARCH_CLOCKMODES \ + VDSO_CLOCKMODE_TOD + +#endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h new file mode 100644 index 0000000000..73ee891426 --- /dev/null +++ b/arch/s390/include/asm/vdso/data.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __S390_ASM_VDSO_DATA_H +#define __S390_ASM_VDSO_DATA_H + +#include +#include + +struct arch_vdso_data { + __s64 tod_steering_delta; + __u64 tod_steering_end; +}; + +#endif /* __S390_ASM_VDSO_DATA_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h new file mode 100644 index 0000000000..db84942eb7 --- /dev/null +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_VDSO_GETTIMEOFDAY_H +#define ASM_VDSO_GETTIMEOFDAY_H + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +#include +#include +#include +#include + +#define vdso_calc_delta __arch_vdso_calc_delta +static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + return (cycles - last) * mult; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return _vdso_data; +} + +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +{ + u64 adj, now; + + now = get_tod_clock(); + adj = vd->arch_data.tod_steering_end - now; + if (unlikely((s64) adj > 0)) + now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + return now; +} + +static __always_inline +long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_gettime, (long)clkid, (long)ts); +} + +static __always_inline +long gettimeofday_fallback(register struct __kernel_old_timeval *tv, + register struct timezone *tz) +{ + return syscall2(__NR_gettimeofday, (long)tv, (long)tz); +} + +static __always_inline +long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) +{ + return syscall2(__NR_clock_getres, (long)clkid, (long)ts); +} + +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +{ + return _timens_data; +} +#endif + +#endif diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h new file mode 100644 index 0000000000..cfcc3e117c --- /dev/null +++ b/arch/s390/include/asm/vdso/processor.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_VDSO_PROCESSOR_H +#define __ASM_VDSO_PROCESSOR_H + +#define cpu_relax() barrier() + +#endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h new file mode 100644 index 0000000000..6c67c08cef --- /dev/null +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. + */ + +static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) +{ + return vdso_data; +} +#define __arch_get_k_vdso_data __s390_get_k_vdso_data + +/* The asm-generic header needs to be included after the definitions above */ +#include + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/s390/include/asm/vmalloc.h b/arch/s390/include/asm/vmalloc.h new file mode 100644 index 0000000000..3ba3a6bdca --- /dev/null +++ b/arch/s390/include/asm/vmalloc.h @@ -0,0 +1,4 @@ +#ifndef _ASM_S390_VMALLOC_H +#define _ASM_S390_VMALLOC_H + +#endif /* _ASM_S390_VMALLOC_H */ diff --git a/arch/s390/include/asm/vmlinux.lds.h b/arch/s390/include/asm/vmlinux.lds.h new file mode 100644 index 0000000000..cbe670a686 --- /dev/null +++ b/arch/s390/include/asm/vmlinux.lds.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +/* + * .boot.data section is shared between the decompressor code and the + * decompressed kernel. The decompressor will store values in it, and copy + * over to the decompressed image before starting it. + * + * .boot.data variables are kept in separate .boot.data. sections, + * which are sorted by alignment first, then by name before being merged + * into single .boot.data section. This way big holes cased by page aligned + * structs are avoided and linker produces consistent result. + */ +#define BOOT_DATA \ + . = ALIGN(PAGE_SIZE); \ + .boot.data : { \ + __boot_data_start = .; \ + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.data*))) \ + __boot_data_end = .; \ + } + +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define BOOT_DATA_PRESERVED \ + . = ALIGN(PAGE_SIZE); \ + .boot.preserved.data : { \ + __boot_data_preserved_start = .; \ + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.preserved.data*))) \ + __boot_data_preserved_end = .; \ + } diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h new file mode 100644 index 0000000000..fe17e448c0 --- /dev/null +++ b/arch/s390/include/asm/vtime.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _S390_VTIME_H +#define _S390_VTIME_H + +#define __ARCH_HAS_VTIME_TASK_SWITCH + +static inline void update_timer_sys(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; +} + +static inline void update_timer_mcck(void) +{ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer; + S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer; + S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer; +} + +#endif /* _S390_VTIME_H */ diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h new file mode 100644 index 0000000000..e601adaa63 --- /dev/null +++ b/arch/s390/include/asm/vtimer.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2003, 2012 + * Virtual CPU timer + * + * Author(s): Jan Glauber + */ + +#ifndef _ASM_S390_TIMER_H +#define _ASM_S390_TIMER_H + +#define VTIMER_MAX_SLICE (0x7fffffffffffffffULL) + +struct vtimer_list { + struct list_head entry; + u64 expires; + u64 interval; + void (*function)(unsigned long); + unsigned long data; +}; + +extern void init_virt_timer(struct vtimer_list *timer); +extern void add_virt_timer(struct vtimer_list *timer); +extern void add_virt_timer_periodic(struct vtimer_list *timer); +extern int mod_virt_timer(struct vtimer_list *timer, u64 expires); +extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires); +extern int del_virt_timer(struct vtimer_list *timer); +extern void vtime_init(void); + +#endif /* _ASM_S390_TIMER_H */ diff --git a/arch/s390/include/asm/vx-insn-asm.h b/arch/s390/include/asm/vx-insn-asm.h new file mode 100644 index 0000000000..360f8b36d9 --- /dev/null +++ b/arch/s390/include/asm/vx-insn-asm.h @@ -0,0 +1,681 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for Vector Instructions + * + * Assembler macros to generate .byte/.word code for particular + * vector instructions that are supported by recent binutils (>= 2.26) only. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ + +#ifndef __ASM_S390_VX_INSN_INTERNAL_H +#define __ASM_S390_VX_INSN_INTERNAL_H + +#ifndef __ASM_S390_VX_INSN_H +#error only can be included directly +#endif + +#ifdef __ASSEMBLY__ + +/* Macros to generate vector instruction byte code */ + +/* GR_NUM - Retrieve general-purpose register number + * + * @opd: Operand to store register number + * @r64: String designation register in the format "%rN" + */ +.macro GR_NUM opd gr + \opd = 255 + .ifc \gr,%r0 + \opd = 0 + .endif + .ifc \gr,%r1 + \opd = 1 + .endif + .ifc \gr,%r2 + \opd = 2 + .endif + .ifc \gr,%r3 + \opd = 3 + .endif + .ifc \gr,%r4 + \opd = 4 + .endif + .ifc \gr,%r5 + \opd = 5 + .endif + .ifc \gr,%r6 + \opd = 6 + .endif + .ifc \gr,%r7 + \opd = 7 + .endif + .ifc \gr,%r8 + \opd = 8 + .endif + .ifc \gr,%r9 + \opd = 9 + .endif + .ifc \gr,%r10 + \opd = 10 + .endif + .ifc \gr,%r11 + \opd = 11 + .endif + .ifc \gr,%r12 + \opd = 12 + .endif + .ifc \gr,%r13 + \opd = 13 + .endif + .ifc \gr,%r14 + \opd = 14 + .endif + .ifc \gr,%r15 + \opd = 15 + .endif + .if \opd == 255 + \opd = \gr + .endif +.endm + +/* VX_NUM - Retrieve vector register number + * + * @opd: Operand to store register number + * @vxr: String designation register in the format "%vN" + * + * The vector register number is used for as input number to the + * instruction and, as well as, to compute the RXB field of the + * instruction. + */ +.macro VX_NUM opd vxr + \opd = 255 + .ifc \vxr,%v0 + \opd = 0 + .endif + .ifc \vxr,%v1 + \opd = 1 + .endif + .ifc \vxr,%v2 + \opd = 2 + .endif + .ifc \vxr,%v3 + \opd = 3 + .endif + .ifc \vxr,%v4 + \opd = 4 + .endif + .ifc \vxr,%v5 + \opd = 5 + .endif + .ifc \vxr,%v6 + \opd = 6 + .endif + .ifc \vxr,%v7 + \opd = 7 + .endif + .ifc \vxr,%v8 + \opd = 8 + .endif + .ifc \vxr,%v9 + \opd = 9 + .endif + .ifc \vxr,%v10 + \opd = 10 + .endif + .ifc \vxr,%v11 + \opd = 11 + .endif + .ifc \vxr,%v12 + \opd = 12 + .endif + .ifc \vxr,%v13 + \opd = 13 + .endif + .ifc \vxr,%v14 + \opd = 14 + .endif + .ifc \vxr,%v15 + \opd = 15 + .endif + .ifc \vxr,%v16 + \opd = 16 + .endif + .ifc \vxr,%v17 + \opd = 17 + .endif + .ifc \vxr,%v18 + \opd = 18 + .endif + .ifc \vxr,%v19 + \opd = 19 + .endif + .ifc \vxr,%v20 + \opd = 20 + .endif + .ifc \vxr,%v21 + \opd = 21 + .endif + .ifc \vxr,%v22 + \opd = 22 + .endif + .ifc \vxr,%v23 + \opd = 23 + .endif + .ifc \vxr,%v24 + \opd = 24 + .endif + .ifc \vxr,%v25 + \opd = 25 + .endif + .ifc \vxr,%v26 + \opd = 26 + .endif + .ifc \vxr,%v27 + \opd = 27 + .endif + .ifc \vxr,%v28 + \opd = 28 + .endif + .ifc \vxr,%v29 + \opd = 29 + .endif + .ifc \vxr,%v30 + \opd = 30 + .endif + .ifc \vxr,%v31 + \opd = 31 + .endif + .if \opd == 255 + \opd = \vxr + .endif +.endm + +/* RXB - Compute most significant bit used vector registers + * + * @rxb: Operand to store computed RXB value + * @v1: First vector register designated operand + * @v2: Second vector register designated operand + * @v3: Third vector register designated operand + * @v4: Fourth vector register designated operand + */ +.macro RXB rxb v1 v2=0 v3=0 v4=0 + \rxb = 0 + .if \v1 & 0x10 + \rxb = \rxb | 0x08 + .endif + .if \v2 & 0x10 + \rxb = \rxb | 0x04 + .endif + .if \v3 & 0x10 + \rxb = \rxb | 0x02 + .endif + .if \v4 & 0x10 + \rxb = \rxb | 0x01 + .endif +.endm + +/* MRXB - Generate Element Size Control and RXB value + * + * @m: Element size control + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXB m v1 v2=0 v3=0 v4=0 + rxb = 0 + RXB rxb, \v1, \v2, \v3, \v4 + .byte (\m << 4) | rxb +.endm + +/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields + * + * @m: Element size control + * @opc: Opcode + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 + MRXB \m, \v1, \v2, \v3, \v4 + .byte \opc +.endm + +/* Vector support instructions */ + +/* VECTOR GENERATE BYTE MASK */ +.macro VGBM vr imm2 + VX_NUM v1, \vr + .word (0xE700 | ((v1&15) << 4)) + .word \imm2 + MRXBOPC 0, 0x44, v1 +.endm +.macro VZERO vxr + VGBM \vxr, 0 +.endm +.macro VONE vxr + VGBM \vxr, 0xFFFF +.endm + +/* VECTOR LOAD VR ELEMENT FROM GR */ +.macro VLVG v, gr, disp, m + VX_NUM v1, \v + GR_NUM b2, "%r0" + GR_NUM r3, \gr + .word 0xE700 | ((v1&15) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x22, v1 +.endm +.macro VLVGB v, gr, index, base + VLVG \v, \gr, \index, \base, 0 +.endm +.macro VLVGH v, gr, index + VLVG \v, \gr, \index, 1 +.endm +.macro VLVGF v, gr, index + VLVG \v, \gr, \index, 2 +.endm +.macro VLVGG v, gr, index + VLVG \v, \gr, \index, 3 +.endm + +/* VECTOR LOAD REGISTER */ +.macro VLR v1, v2 + VX_NUM v1, \v1 + VX_NUM v2, \v2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0 + MRXBOPC 0, 0x56, v1, v2 +.endm + +/* VECTOR LOAD */ +.macro VL v, disp, index="%r0", base + VX_NUM v1, \v + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x06, v1 +.endm + +/* VECTOR LOAD ELEMENT */ +.macro VLEx vr1, disp, index="%r0", base, m3, opc + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEB vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x00 +.endm +.macro VLEH vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x01 +.endm +.macro VLEF vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x03 +.endm +.macro VLEG vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x02 +.endm + +/* VECTOR LOAD ELEMENT IMMEDIATE */ +.macro VLEIx vr1, imm2, m3, opc + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEIB vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x40 +.endm +.macro VLEIH vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x41 +.endm +.macro VLEIF vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x43 +.endm +.macro VLEIG vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x42 +.endm + +/* VECTOR LOAD GR FROM VR ELEMENT */ +.macro VLGV gr, vr, disp, base="%r0", m + GR_NUM r1, \gr + GR_NUM b2, \base + VX_NUM v3, \vr + .word 0xE700 | (r1 << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x21, v3 +.endm +.macro VLGVB gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 0 +.endm +.macro VLGVH gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 1 +.endm +.macro VLGVF gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 2 +.endm +.macro VLGVG gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 3 +.endm + +/* VECTOR LOAD MULTIPLE */ +.macro VLM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x36, v1, v3 +.endm + +/* VECTOR STORE */ +.macro VST vr1, disp, index="%r0", base + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (x2&15) + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x0E, v1 +.endm + +/* VECTOR STORE MULTIPLE */ +.macro VSTM vfrom, vto, disp, base, hint=3 + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \hint, 0x3E, v1, v3 +.endm + +/* VECTOR PERMUTE */ +.macro VPERM vr1, vr2, vr3, vr4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 +.endm + +/* VECTOR UNPACK LOGICAL LOW */ +.macro VUPLL vr1, vr2, m3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0x0000 + MRXBOPC \m3, 0xD4, v1, v2 +.endm +.macro VUPLLB vr1, vr2 + VUPLL \vr1, \vr2, 0 +.endm +.macro VUPLLH vr1, vr2 + VUPLL \vr1, \vr2, 1 +.endm +.macro VUPLLF vr1, vr2 + VUPLL \vr1, \vr2, 2 +.endm + +/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */ +.macro VPDI vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x84, v1, v2, v3 +.endm + +/* VECTOR REPLICATE */ +.macro VREP vr1, vr3, imm2, m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word \imm2 + MRXBOPC \m4, 0x4D, v1, v3 +.endm +.macro VREPB vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 0 +.endm +.macro VREPH vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 1 +.endm +.macro VREPF vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 2 +.endm +.macro VREPG vr1, vr3, imm2 + VREP \vr1, \vr3, \imm2, 3 +.endm + +/* VECTOR MERGE HIGH */ +.macro VMRH vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x61, v1, v2, v3 +.endm +.macro VMRHB vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 0 +.endm +.macro VMRHH vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 1 +.endm +.macro VMRHF vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 2 +.endm +.macro VMRHG vr1, vr2, vr3 + VMRH \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR MERGE LOW */ +.macro VMRL vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x60, v1, v2, v3 +.endm +.macro VMRLB vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 0 +.endm +.macro VMRLH vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 1 +.endm +.macro VMRLF vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 2 +.endm +.macro VMRLG vr1, vr2, vr3 + VMRL \vr1, \vr2, \vr3, 3 +.endm + + +/* Vector integer instructions */ + +/* VECTOR AND */ +.macro VN vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x68, v1, v2, v3 +.endm + +/* VECTOR EXCLUSIVE OR */ +.macro VX vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x6D, v1, v2, v3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM */ +.macro VGFM vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xB4, v1, v2, v3 +.endm +.macro VGFMB vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 0 +.endm +.macro VGFMH vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 1 +.endm +.macro VGFMF vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 2 +.endm +.macro VGFMG vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ +.macro VGFMA vr1, vr2, vr3, vr4, m5 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\m5 << 8) + MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 +.endm +.macro VGFMAB vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 0 +.endm +.macro VGFMAH vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 1 +.endm +.macro VGFMAF vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 2 +.endm +.macro VGFMAG vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 3 +.endm + +/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ +.macro VSRLB vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x7D, v1, v2, v3 +.endm + +/* VECTOR REPLICATE IMMEDIATE */ +.macro VREPI vr1, imm2, m3 + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, 0x45, v1 +.endm +.macro VREPIB vr1, imm2 + VREPI \vr1, \imm2, 0 +.endm +.macro VREPIH vr1, imm2 + VREPI \vr1, \imm2, 1 +.endm +.macro VREPIF vr1, imm2 + VREPI \vr1, \imm2, 2 +.endm +.macro VREPIG vr1, imm2 + VREP \vr1, \imm2, 3 +.endm + +/* VECTOR ADD */ +.macro VA vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xF3, v1, v2, v3 +.endm +.macro VAB vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 0 +.endm +.macro VAH vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 1 +.endm +.macro VAF vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 2 +.endm +.macro VAG vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 3 +.endm +.macro VAQ vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 4 +.endm + +/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ +.macro VESRAV vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x7A, v1, v2, v3 +.endm + +.macro VESRAVB vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 0 +.endm +.macro VESRAVH vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 1 +.endm +.macro VESRAVF vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 2 +.endm +.macro VESRAVG vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR ELEMENT ROTATE LEFT LOGICAL */ +.macro VERLL vr1, vr3, disp, base="%r0", m4 + VX_NUM v1, \vr1 + VX_NUM v3, \vr3 + GR_NUM b2, \base + .word 0xE700 | ((v1&15) << 4) | (v3&15) + .word (b2 << 12) | (\disp) + MRXBOPC \m4, 0x33, v1, v3 +.endm +.macro VERLLB vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 0 +.endm +.macro VERLLH vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 1 +.endm +.macro VERLLF vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 2 +.endm +.macro VERLLG vr1, vr3, disp, base="%r0" + VERLL \vr1, \vr3, \disp, \base, 3 +.endm + +/* VECTOR SHIFT LEFT DOUBLE BY BYTE */ +.macro VSLDB vr1, vr2, vr3, imm4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\imm4) + MRXBOPC 0, 0x77, v1, v2, v3 +.endm + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_VX_INSN_INTERNAL_H */ diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h new file mode 100644 index 0000000000..8c188f1c6d --- /dev/null +++ b/arch/s390/include/asm/vx-insn.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for Vector Instructions + * + * This wrapper header file allows to use the vector instruction macros in + * both assembler files as well as in inline assemblies in C files. + */ + +#ifndef __ASM_S390_VX_INSN_H +#define __ASM_S390_VX_INSN_H + +#include + +#ifndef __ASSEMBLY__ + +asm(".include \"asm/vx-insn-asm.h\"\n"); + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_VX_INSN_H */ diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h new file mode 100644 index 0000000000..857d6759b6 --- /dev/null +++ b/arch/s390/include/asm/xor.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Optimited xor routines + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ +#ifndef _ASM_S390_XOR_H +#define _ASM_S390_XOR_H + +extern struct xor_block_template xor_block_xc; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_xc); \ +} while (0) + +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_xc) + +#endif /* _ASM_S390_XOR_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild new file mode 100644 index 0000000000..46c1ff0b84 --- /dev/null +++ b/arch/s390/include/uapi/asm/Kbuild @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +generated-y += unistd_32.h +generated-y += unistd_64.h diff --git a/arch/s390/include/uapi/asm/auxvec.h b/arch/s390/include/uapi/asm/auxvec.h new file mode 100644 index 0000000000..a056c4637f --- /dev/null +++ b/arch/s390/include/uapi/asm/auxvec.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASMS390_AUXVEC_H +#define __ASMS390_AUXVEC_H + +#define AT_SYSINFO_EHDR 33 + +#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */ + +#endif diff --git a/arch/s390/include/uapi/asm/bitsperlong.h b/arch/s390/include/uapi/asm/bitsperlong.h new file mode 100644 index 0000000000..cceaf47b02 --- /dev/null +++ b/arch/s390/include/uapi/asm/bitsperlong.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_BITSPERLONG_H +#define __ASM_S390_BITSPERLONG_H + +#ifndef __s390x__ +#define __BITS_PER_LONG 32 +#else +#define __BITS_PER_LONG 64 +#endif + +#include + +#endif /* __ASM_S390_BITSPERLONG_H */ + diff --git a/arch/s390/include/uapi/asm/bpf_perf_event.h b/arch/s390/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 0000000000..3ed42ff6da --- /dev/null +++ b/arch/s390/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_BPF_PERF_EVENT_H__ + +#include + +typedef user_pt_regs bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/arch/s390/include/uapi/asm/byteorder.h b/arch/s390/include/uapi/asm/byteorder.h new file mode 100644 index 0000000000..1442b57dd9 --- /dev/null +++ b/arch/s390/include/uapi/asm/byteorder.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _S390_BYTEORDER_H +#define _S390_BYTEORDER_H + +#include + +#endif /* _S390_BYTEORDER_H */ diff --git a/arch/s390/include/uapi/asm/chpid.h b/arch/s390/include/uapi/asm/chpid.h new file mode 100644 index 0000000000..2ae2ed8c09 --- /dev/null +++ b/arch/s390/include/uapi/asm/chpid.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2007, 2012 + * Author(s): Peter Oberparleiter + */ + +#ifndef _UAPI_ASM_S390_CHPID_H +#define _UAPI_ASM_S390_CHPID_H + +#include +#include + +#define __MAX_CHPID 255 + +struct chp_id { + __u8 reserved1; + __u8 cssid; + __u8 reserved2; + __u8 id; +} __attribute__((packed)); + + +#endif /* _UAPI_ASM_S390_CHPID_H */ diff --git a/arch/s390/include/uapi/asm/chsc.h b/arch/s390/include/uapi/asm/chsc.h new file mode 100644 index 0000000000..83a574e95b --- /dev/null +++ b/arch/s390/include/uapi/asm/chsc.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ioctl interface for /dev/chsc + * + * Copyright IBM Corp. 2008, 2012 + * Author(s): Cornelia Huck + */ + +#ifndef _ASM_CHSC_H +#define _ASM_CHSC_H + +#include +#include +#include +#include + +#define CHSC_SIZE 0x1000 + +struct chsc_async_header { + __u16 length; + __u16 code; + __u32 cmd_dependend; + __u32 key : 4; + __u32 : 28; + struct subchannel_id sid; +}; + +struct chsc_async_area { + struct chsc_async_header header; + __u8 data[CHSC_SIZE - sizeof(struct chsc_async_header)]; +}; + +struct chsc_header { + __u16 length; + __u16 code; +}; + +struct chsc_sync_area { + struct chsc_header header; + __u8 data[CHSC_SIZE - sizeof(struct chsc_header)]; +}; + +struct chsc_response_struct { + __u16 length; + __u16 code; + __u32 parms; + __u8 data[CHSC_SIZE - 2 * sizeof(__u16) - sizeof(__u32)]; +}; + +struct chsc_chp_cd { + struct chp_id chpid; + int m; + int fmt; + struct chsc_response_struct cpcb; +}; + +struct chsc_cu_cd { + __u16 cun; + __u8 cssid; + int m; + int fmt; + struct chsc_response_struct cucb; +}; + +struct chsc_sch_cud { + struct subchannel_id schid; + int fmt; + struct chsc_response_struct scub; +}; + +struct conf_id { + int m; + __u8 cssid; + __u8 ssid; +}; + +struct chsc_conf_info { + struct conf_id id; + int fmt; + struct chsc_response_struct scid; +}; + +struct ccl_parm_chpid { + int m; + struct chp_id chp; +}; + +struct ccl_parm_cssids { + __u8 f_cssid; + __u8 l_cssid; +}; + +struct chsc_comp_list { + struct { + enum { + CCL_CU_ON_CHP = 1, + CCL_CHP_TYPE_CAP = 2, + CCL_CSS_IMG = 4, + CCL_CSS_IMG_CONF_CHAR = 5, + CCL_IOP_CHP = 6, + } ctype; + int fmt; + struct ccl_parm_chpid chpid; + struct ccl_parm_cssids cssids; + } req; + struct chsc_response_struct sccl; +}; + +struct chsc_dcal { + struct { + enum { + DCAL_CSS_IID_PN = 4, + } atype; + __u32 list_parm[2]; + int fmt; + } req; + struct chsc_response_struct sdcal; +}; + +struct chsc_cpd_info { + struct chp_id chpid; + int m; + int fmt; + int rfmt; + int c; + struct chsc_response_struct chpdb; +}; + +#define CHSC_IOCTL_MAGIC 'c' + +#define CHSC_START _IOWR(CHSC_IOCTL_MAGIC, 0x81, struct chsc_async_area) +#define CHSC_INFO_CHANNEL_PATH _IOWR(CHSC_IOCTL_MAGIC, 0x82, \ + struct chsc_chp_cd) +#define CHSC_INFO_CU _IOWR(CHSC_IOCTL_MAGIC, 0x83, struct chsc_cu_cd) +#define CHSC_INFO_SCH_CU _IOWR(CHSC_IOCTL_MAGIC, 0x84, struct chsc_sch_cud) +#define CHSC_INFO_CI _IOWR(CHSC_IOCTL_MAGIC, 0x85, struct chsc_conf_info) +#define CHSC_INFO_CCL _IOWR(CHSC_IOCTL_MAGIC, 0x86, struct chsc_comp_list) +#define CHSC_INFO_CPD _IOWR(CHSC_IOCTL_MAGIC, 0x87, struct chsc_cpd_info) +#define CHSC_INFO_DCAL _IOWR(CHSC_IOCTL_MAGIC, 0x88, struct chsc_dcal) +#define CHSC_START_SYNC _IOWR(CHSC_IOCTL_MAGIC, 0x89, struct chsc_sync_area) +#define CHSC_ON_CLOSE_SET _IOWR(CHSC_IOCTL_MAGIC, 0x8a, struct chsc_async_area) +#define CHSC_ON_CLOSE_REMOVE _IO(CHSC_IOCTL_MAGIC, 0x8b) + +#endif diff --git a/arch/s390/include/uapi/asm/clp.h b/arch/s390/include/uapi/asm/clp.h new file mode 100644 index 0000000000..b36d9e9cdd --- /dev/null +++ b/arch/s390/include/uapi/asm/clp.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ioctl interface for /dev/clp + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#ifndef _ASM_CLP_H +#define _ASM_CLP_H + +#include +#include + +struct clp_req { + unsigned int c : 1; + unsigned int r : 1; + unsigned int lps : 6; + unsigned int cmd : 8; + unsigned int : 16; + unsigned int reserved; + __u64 data_p; +}; + +#define CLP_IOCTL_MAGIC 'c' + +#define CLP_SYNC _IOWR(CLP_IOCTL_MAGIC, 0xC1, struct clp_req) + +#endif diff --git a/arch/s390/include/uapi/asm/cmb.h b/arch/s390/include/uapi/asm/cmb.h new file mode 100644 index 0000000000..115434ab98 --- /dev/null +++ b/arch/s390/include/uapi/asm/cmb.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPIS390_CMB_H +#define _UAPIS390_CMB_H + +#include + +/** + * struct cmbdata - channel measurement block data for user space + * @size: size of the stored data + * @elapsed_time: time since last sampling + * @ssch_rsch_count: number of ssch and rsch + * @sample_count: number of samples + * @device_connect_time: time of device connect + * @function_pending_time: time of function pending + * @device_disconnect_time: time of device disconnect + * @control_unit_queuing_time: time of control unit queuing + * @device_active_only_time: time of device active only + * @device_busy_time: time of device busy (ext. format) + * @initial_command_response_time: initial command response time (ext. format) + * + * All values are stored as 64 bit for simplicity, especially + * in 32 bit emulation mode. All time values are normalized to + * nanoseconds. + * Currently, two formats are known, which differ by the size of + * this structure, i.e. the last two members are only set when + * the extended channel measurement facility (first shipped in + * z990 machines) is activated. + * Potentially, more fields could be added, which would result in a + * new ioctl number. + */ +struct cmbdata { + __u64 size; + __u64 elapsed_time; + /* basic and extended format: */ + __u64 ssch_rsch_count; + __u64 sample_count; + __u64 device_connect_time; + __u64 function_pending_time; + __u64 device_disconnect_time; + __u64 control_unit_queuing_time; + __u64 device_active_only_time; + /* extended format only: */ + __u64 device_busy_time; + __u64 initial_command_response_time; +}; + +/* enable channel measurement */ +#define BIODASDCMFENABLE _IO(DASD_IOCTL_LETTER, 32) +/* enable channel measurement */ +#define BIODASDCMFDISABLE _IO(DASD_IOCTL_LETTER, 33) +/* read channel measurement data */ +#define BIODASDREADALLCMB _IOWR(DASD_IOCTL_LETTER, 33, struct cmbdata) + +#endif /* _UAPIS390_CMB_H */ diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h new file mode 100644 index 0000000000..b11d988004 --- /dev/null +++ b/arch/s390/include/uapi/asm/dasd.h @@ -0,0 +1,354 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Author(s)......: Holger Smolinski + * Bugreports.to..: + * Copyright IBM Corp. 1999, 2000 + * EMC Symmetrix ioctl Copyright EMC Corporation, 2008 + * Author.........: Nigel Hislop + * + * This file is the interface of the DASD device driver, which is exported to user space + * any future changes wrt the API will result in a change of the APIVERSION reported + * to userspace by the DASDAPIVER-ioctl + * + */ + +#ifndef DASD_H +#define DASD_H +#include +#include + +#define DASD_IOCTL_LETTER 'D' + +#define DASD_API_VERSION 6 + +/* + * struct dasd_information2_t + * represents any data about the device, which is visible to userspace. + * including format and featueres. + */ +typedef struct dasd_information2_t { + unsigned int devno; /* S/390 devno */ + unsigned int real_devno; /* for aliases */ + unsigned int schid; /* S/390 subchannel identifier */ + unsigned int cu_type : 16; /* from SenseID */ + unsigned int cu_model : 8; /* from SenseID */ + unsigned int dev_type : 16; /* from SenseID */ + unsigned int dev_model : 8; /* from SenseID */ + unsigned int open_count; + unsigned int req_queue_len; + unsigned int chanq_len; /* length of chanq */ + char type[4]; /* from discipline.name, 'none' for unknown */ + unsigned int status; /* current device level */ + unsigned int label_block; /* where to find the VOLSER */ + unsigned int FBA_layout; /* fixed block size (like AIXVOL) */ + unsigned int characteristics_size; + unsigned int confdata_size; + char characteristics[64]; /* from read_device_characteristics */ + char configuration_data[256]; /* from read_configuration_data */ + unsigned int format; /* format info like formatted/cdl/ldl/... */ + unsigned int features; /* dasd features like 'ro',... */ + unsigned int reserved0; /* reserved for further use ,... */ + unsigned int reserved1; /* reserved for further use ,... */ + unsigned int reserved2; /* reserved for further use ,... */ + unsigned int reserved3; /* reserved for further use ,... */ + unsigned int reserved4; /* reserved for further use ,... */ + unsigned int reserved5; /* reserved for further use ,... */ + unsigned int reserved6; /* reserved for further use ,... */ + unsigned int reserved7; /* reserved for further use ,... */ +} dasd_information2_t; + +/* + * values to be used for dasd_information_t.format + * 0x00: NOT formatted + * 0x01: Linux disc layout + * 0x02: Common disc layout + */ +#define DASD_FORMAT_NONE 0 +#define DASD_FORMAT_LDL 1 +#define DASD_FORMAT_CDL 2 +/* + * values to be used for dasd_information_t.features + * 0x100: default features + * 0x001: readonly (ro) + * 0x002: use diag discipline (diag) + * 0x004: set the device initially online (internal use only) + * 0x008: enable ERP related logging + * 0x010: allow I/O to fail on lost paths + * 0x020: allow I/O to fail when a lock was stolen + * 0x040: give access to raw eckd data + * 0x080: enable discard support + * 0x100: enable autodisable for IFCC errors (default) + * 0x200: enable requeue of all requests on autoquiesce + */ +#define DASD_FEATURE_READONLY 0x001 +#define DASD_FEATURE_USEDIAG 0x002 +#define DASD_FEATURE_INITIAL_ONLINE 0x004 +#define DASD_FEATURE_ERPLOG 0x008 +#define DASD_FEATURE_FAILFAST 0x010 +#define DASD_FEATURE_FAILONSLCK 0x020 +#define DASD_FEATURE_USERAW 0x040 +#define DASD_FEATURE_DISCARD 0x080 +#define DASD_FEATURE_PATH_AUTODISABLE 0x100 +#define DASD_FEATURE_REQUEUEQUIESCE 0x200 +#define DASD_FEATURE_DEFAULT DASD_FEATURE_PATH_AUTODISABLE + +#define DASD_PARTN_BITS 2 + +/* + * struct dasd_information_t + * represents any data about the data, which is visible to userspace + */ +typedef struct dasd_information_t { + unsigned int devno; /* S/390 devno */ + unsigned int real_devno; /* for aliases */ + unsigned int schid; /* S/390 subchannel identifier */ + unsigned int cu_type : 16; /* from SenseID */ + unsigned int cu_model : 8; /* from SenseID */ + unsigned int dev_type : 16; /* from SenseID */ + unsigned int dev_model : 8; /* from SenseID */ + unsigned int open_count; + unsigned int req_queue_len; + unsigned int chanq_len; /* length of chanq */ + char type[4]; /* from discipline.name, 'none' for unknown */ + unsigned int status; /* current device level */ + unsigned int label_block; /* where to find the VOLSER */ + unsigned int FBA_layout; /* fixed block size (like AIXVOL) */ + unsigned int characteristics_size; + unsigned int confdata_size; + char characteristics[64]; /* from read_device_characteristics */ + char configuration_data[256]; /* from read_configuration_data */ +} dasd_information_t; + +/* + * Read Subsystem Data - Performance Statistics + */ +typedef struct dasd_rssd_perf_stats_t { + unsigned char invalid:1; + unsigned char format:3; + unsigned char data_format:4; + unsigned char unit_address; + unsigned short device_status; + unsigned int nr_read_normal; + unsigned int nr_read_normal_hits; + unsigned int nr_write_normal; + unsigned int nr_write_fast_normal_hits; + unsigned int nr_read_seq; + unsigned int nr_read_seq_hits; + unsigned int nr_write_seq; + unsigned int nr_write_fast_seq_hits; + unsigned int nr_read_cache; + unsigned int nr_read_cache_hits; + unsigned int nr_write_cache; + unsigned int nr_write_fast_cache_hits; + unsigned int nr_inhibit_cache; + unsigned int nr_bybass_cache; + unsigned int nr_seq_dasd_to_cache; + unsigned int nr_dasd_to_cache; + unsigned int nr_cache_to_dasd; + unsigned int nr_delayed_fast_write; + unsigned int nr_normal_fast_write; + unsigned int nr_seq_fast_write; + unsigned int nr_cache_miss; + unsigned char status2; + unsigned int nr_quick_write_promotes; + unsigned char reserved; + unsigned short ssid; + unsigned char reseved2[96]; +} __attribute__((packed)) dasd_rssd_perf_stats_t; + +/* + * struct profile_info_t + * holds the profinling information + */ +typedef struct dasd_profile_info_t { + unsigned int dasd_io_reqs; /* number of requests processed at all */ + unsigned int dasd_io_sects; /* number of sectors processed at all */ + unsigned int dasd_io_secs[32]; /* histogram of request's sizes */ + unsigned int dasd_io_times[32]; /* histogram of requests's times */ + unsigned int dasd_io_timps[32]; /* histogram of requests's times per sector */ + unsigned int dasd_io_time1[32]; /* histogram of time from build to start */ + unsigned int dasd_io_time2[32]; /* histogram of time from start to irq */ + unsigned int dasd_io_time2ps[32]; /* histogram of time from start to irq */ + unsigned int dasd_io_time3[32]; /* histogram of time from irq to end */ + unsigned int dasd_io_nr_req[32]; /* histogram of # of requests in chanq */ +} dasd_profile_info_t; + +/* + * struct format_data_t + * represents all data necessary to format a dasd + */ +typedef struct format_data_t { + unsigned int start_unit; /* from track */ + unsigned int stop_unit; /* to track */ + unsigned int blksize; /* sectorsize */ + unsigned int intensity; +} format_data_t; + +/* + * struct dasd_copypair_swap_data_t + * represents all data necessary to issue a swap of the copy pair relation + */ +struct dasd_copypair_swap_data_t { + char primary[20]; /* BUSID of primary */ + char secondary[20]; /* BUSID of secondary */ + + /* Reserved for future updates. */ + __u8 reserved[64]; +}; + +/* + * values to be used for format_data_t.intensity + * 0/8: normal format + * 1/9: also write record zero + * 3/11: also write home address + * 4/12: invalidate track + */ +#define DASD_FMT_INT_FMT_R0 1 /* write record zero */ +#define DASD_FMT_INT_FMT_HA 2 /* write home address, also set FMT_R0 ! */ +#define DASD_FMT_INT_INVAL 4 /* invalidate tracks */ +#define DASD_FMT_INT_COMPAT 8 /* use OS/390 compatible disk layout */ +#define DASD_FMT_INT_FMT_NOR0 16 /* remove permission to write record zero */ +#define DASD_FMT_INT_ESE_FULL 32 /* release space for entire volume */ + +/* + * struct format_check_t + * represents all data necessary to evaluate the format of + * different tracks of a dasd + */ +typedef struct format_check_t { + /* Input */ + struct format_data_t expect; + + /* Output */ + unsigned int result; /* Error indication (DASD_FMT_ERR_*) */ + unsigned int unit; /* Track that is in error */ + unsigned int rec; /* Record that is in error */ + unsigned int num_records; /* Records in the track in error */ + unsigned int blksize; /* Blocksize of first record in error */ + unsigned int key_length; /* Key length of first record in error */ +} format_check_t; + +/* Values returned in format_check_t when a format error is detected: */ +/* Too few records were found on a single track */ +#define DASD_FMT_ERR_TOO_FEW_RECORDS 1 +/* Too many records were found on a single track */ +#define DASD_FMT_ERR_TOO_MANY_RECORDS 2 +/* Blocksize/data-length of a record was wrong */ +#define DASD_FMT_ERR_BLKSIZE 3 +/* A record ID is defined by cylinder, head, and record number (CHR). */ +/* On mismatch, this error is set */ +#define DASD_FMT_ERR_RECORD_ID 4 +/* If key-length was != 0 */ +#define DASD_FMT_ERR_KEY_LENGTH 5 + +/* + * struct attrib_data_t + * represents the operation (cache) bits for the device. + * Used in DE to influence caching of the DASD. + */ +typedef struct attrib_data_t { + unsigned char operation:3; /* cache operation mode */ + unsigned char reserved:5; /* cache operation mode */ + __u16 nr_cyl; /* no of cyliners for read ahaed */ + __u8 reserved2[29]; /* for future use */ +} __attribute__ ((packed)) attrib_data_t; + +/* definition of operation (cache) bits within attributes of DE */ +#define DASD_NORMAL_CACHE 0x0 +#define DASD_BYPASS_CACHE 0x1 +#define DASD_INHIBIT_LOAD 0x2 +#define DASD_SEQ_ACCESS 0x3 +#define DASD_SEQ_PRESTAGE 0x4 +#define DASD_REC_ACCESS 0x5 + +/* + * Perform EMC Symmetrix I/O + */ +typedef struct dasd_symmio_parms { + unsigned char reserved[8]; /* compat with older releases */ + unsigned long long psf_data; /* char * cast to u64 */ + unsigned long long rssd_result; /* char * cast to u64 */ + int psf_data_len; + int rssd_result_len; +} __attribute__ ((packed)) dasd_symmio_parms_t; + +/* + * Data returned by Sense Path Group ID (SNID) + */ +struct dasd_snid_data { + struct { + __u8 group:2; + __u8 reserve:2; + __u8 mode:1; + __u8 res:3; + } __attribute__ ((packed)) path_state; + __u8 pgid[11]; +} __attribute__ ((packed)); + +struct dasd_snid_ioctl_data { + struct dasd_snid_data data; + __u8 path_mask; +} __attribute__ ((packed)); + + +/******************************************************************************** + * SECTION: Definition of IOCTLs + * + * Here ist how the ioctl-nr should be used: + * 0 - 31 DASD driver itself + * 32 - 239 still open + * 240 - 255 reserved for EMC + *******************************************************************************/ + +/* Disable the volume (for Linux) */ +#define BIODASDDISABLE _IO(DASD_IOCTL_LETTER,0) +/* Enable the volume (for Linux) */ +#define BIODASDENABLE _IO(DASD_IOCTL_LETTER,1) +/* Issue a reserve/release command, rsp. */ +#define BIODASDRSRV _IO(DASD_IOCTL_LETTER,2) /* reserve */ +#define BIODASDRLSE _IO(DASD_IOCTL_LETTER,3) /* release */ +#define BIODASDSLCK _IO(DASD_IOCTL_LETTER,4) /* steal lock */ +/* reset profiling information of a device */ +#define BIODASDPRRST _IO(DASD_IOCTL_LETTER,5) +/* Quiesce IO on device */ +#define BIODASDQUIESCE _IO(DASD_IOCTL_LETTER,6) +/* Resume IO on device */ +#define BIODASDRESUME _IO(DASD_IOCTL_LETTER,7) +/* Abort all I/O on a device */ +#define BIODASDABORTIO _IO(DASD_IOCTL_LETTER, 240) +/* Allow I/O on a device */ +#define BIODASDALLOWIO _IO(DASD_IOCTL_LETTER, 241) + + +/* retrieve API version number */ +#define DASDAPIVER _IOR(DASD_IOCTL_LETTER,0,int) +/* Get information on a dasd device */ +#define BIODASDINFO _IOR(DASD_IOCTL_LETTER,1,dasd_information_t) +/* retrieve profiling information of a device */ +#define BIODASDPRRD _IOR(DASD_IOCTL_LETTER,2,dasd_profile_info_t) +/* Get information on a dasd device (enhanced) */ +#define BIODASDINFO2 _IOR(DASD_IOCTL_LETTER,3,dasd_information2_t) +/* Performance Statistics Read */ +#define BIODASDPSRD _IOR(DASD_IOCTL_LETTER,4,dasd_rssd_perf_stats_t) +/* Get Attributes (cache operations) */ +#define BIODASDGATTR _IOR(DASD_IOCTL_LETTER,5,attrib_data_t) + + +/* #define BIODASDFORMAT _IOW(IOCTL_LETTER,0,format_data_t) , deprecated */ +#define BIODASDFMT _IOW(DASD_IOCTL_LETTER,1,format_data_t) +/* Set Attributes (cache operations) */ +#define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) +/* Release Allocated Space */ +#define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) +/* Swap copy pair relation */ +#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t) + +/* Get Sense Path Group ID (SNID) data */ +#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) +/* Check device format according to format_check_t */ +#define BIODASDCHECKFMT _IOWR(DASD_IOCTL_LETTER, 2, format_check_t) + +#define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t) + +#endif /* DASD_H */ + diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h new file mode 100644 index 0000000000..c4bc1108af --- /dev/null +++ b/arch/s390/include/uapi/asm/fs3270.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_FS3270_H +#define __ASM_S390_UAPI_FS3270_H + +#include +#include + +/* ioctls for fullscreen 3270 */ +#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */ +#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */ +#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */ +#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. */ +#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */ + +/* For TUBGETMOD */ +struct raw3270_iocb { + __u16 model; + __u16 line_cnt; + __u16 col_cnt; + __u16 pf_cnt; + __u16 re_cnt; + __u16 map; +}; + +#endif /* __ASM_S390_UAPI_FS3270_H */ diff --git a/arch/s390/include/uapi/asm/guarded_storage.h b/arch/s390/include/uapi/asm/guarded_storage.h new file mode 100644 index 0000000000..666af1c33b --- /dev/null +++ b/arch/s390/include/uapi/asm/guarded_storage.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _GUARDED_STORAGE_H +#define _GUARDED_STORAGE_H + +#include + +struct gs_cb { + __u64 reserved; + __u64 gsd; + __u64 gssm; + __u64 gs_epl_a; +}; + +struct gs_epl { + __u8 pad1; + union { + __u8 gs_eam; + struct { + __u8 : 6; + __u8 e : 1; + __u8 b : 1; + }; + }; + union { + __u8 gs_eci; + struct { + __u8 tx : 1; + __u8 cx : 1; + __u8 : 5; + __u8 in : 1; + }; + }; + union { + __u8 gs_eai; + struct { + __u8 : 1; + __u8 t : 1; + __u8 as : 2; + __u8 ar : 4; + }; + }; + __u32 pad2; + __u64 gs_eha; + __u64 gs_eia; + __u64 gs_eoa; + __u64 gs_eir; + __u64 gs_era; +}; + +#define GS_ENABLE 0 +#define GS_DISABLE 1 +#define GS_SET_BC_CB 2 +#define GS_CLEAR_BC_CB 3 +#define GS_BROADCAST 4 + +static inline void load_gs_cb(struct gs_cb *gs_cb) +{ + asm volatile(".insn rxy,0xe3000000004d,0,%0" : : "Q" (*gs_cb)); +} + +static inline void store_gs_cb(struct gs_cb *gs_cb) +{ + asm volatile(".insn rxy,0xe30000000049,0,%0" : : "Q" (*gs_cb)); +} + +static inline void save_gs_cb(struct gs_cb *gs_cb) +{ + if (gs_cb) + store_gs_cb(gs_cb); +} + +static inline void restore_gs_cb(struct gs_cb *gs_cb) +{ + if (gs_cb) + load_gs_cb(gs_cb); +} + +#endif /* _GUARDED_STORAGE_H */ diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h new file mode 100644 index 0000000000..e56b9dd23a --- /dev/null +++ b/arch/s390/include/uapi/asm/hwctrset.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2021 + * Interface implementation for communication with the CPU Measurement + * counter facility device driver. + * + * Author(s): Thomas Richter + * + * Define for ioctl() commands to communicate with the CPU Measurement + * counter facility device driver. + */ + +#ifndef _PERF_CPUM_CF_DIAG_H +#define _PERF_CPUM_CF_DIAG_H + +#include +#include + +#define S390_HWCTR_DEVICE "hwctr" +#define S390_HWCTR_START_VERSION 1 + +struct s390_ctrset_start { /* Set CPUs to operate on */ + __u64 version; /* Version of interface */ + __u64 data_bytes; /* # of bytes required */ + __u64 cpumask_len; /* Length of CPU mask in bytes */ + __u64 *cpumask; /* Pointer to CPU mask */ + __u64 counter_sets; /* Bit mask of counter sets to get */ +}; + +struct s390_ctrset_setdata { /* Counter set data */ + __u32 set; /* Counter set number */ + __u32 no_cnts; /* # of counters stored in cv[] */ + __u64 cv[]; /* Counter values (variable length) */ +}; + +struct s390_ctrset_cpudata { /* Counter set data per CPU */ + __u32 cpu_nr; /* CPU number */ + __u32 no_sets; /* # of counters sets in data[] */ + struct s390_ctrset_setdata data[]; +}; + +struct s390_ctrset_read { /* Structure to get all ctr sets */ + __u64 no_cpus; /* Total # of CPUs data taken from */ + struct s390_ctrset_cpudata data[]; +}; + +#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ +#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start) +#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2) +#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read) +#endif diff --git a/arch/s390/include/uapi/asm/hypfs.h b/arch/s390/include/uapi/asm/hypfs.h new file mode 100644 index 0000000000..fe6174e142 --- /dev/null +++ b/arch/s390/include/uapi/asm/hypfs.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Structures for hypfs interface + * + * Copyright IBM Corp. 2013 + * + * Author: Martin Schwidefsky + */ + +#ifndef _ASM_HYPFS_H +#define _ASM_HYPFS_H + +#include + +/* + * IOCTL for binary interface /sys/kernel/debug/diag_304 + */ +struct hypfs_diag304 { + __u32 args[2]; + __u64 data; + __u64 rc; +} __attribute__((packed)); + +#define HYPFS_IOCTL_MAGIC 0x10 + +#define HYPFS_DIAG304 \ + _IOWR(HYPFS_IOCTL_MAGIC, 0x20, struct hypfs_diag304) + +/* + * Structures for binary interface /sys/kernel/debug/diag_0c + */ +struct hypfs_diag0c_hdr { + __u64 len; /* Length of diag0c buffer without header */ + __u16 version; /* Version of header */ + char reserved1[6]; /* Reserved */ + char tod_ext[16]; /* TOD clock for diag0c */ + __u64 count; /* Number of entries (CPUs) in diag0c array */ + char reserved2[24]; /* Reserved */ +}; + +struct hypfs_diag0c_entry { + char date[8]; /* MM/DD/YY in EBCDIC */ + char time[8]; /* HH:MM:SS in EBCDIC */ + __u64 virtcpu; /* Virtual time consumed by the virt CPU (us) */ + __u64 totalproc; /* Total of virtual and simulation time (us) */ + __u32 cpu; /* Linux logical CPU number */ + __u32 reserved; /* Align to 8 byte */ +}; + +struct hypfs_diag0c_data { + struct hypfs_diag0c_hdr hdr; /* 64 byte header */ + struct hypfs_diag0c_entry entry[]; /* diag0c entry array */ +}; + +#endif diff --git a/arch/s390/include/uapi/asm/ioctls.h b/arch/s390/include/uapi/asm/ioctls.h new file mode 100644 index 0000000000..342a3284e6 --- /dev/null +++ b/arch/s390/include/uapi/asm/ioctls.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ARCH_S390_IOCTLS_H__ +#define __ARCH_S390_IOCTLS_H__ + +#define FIOQSIZE 0x545E + +#include + +#endif diff --git a/arch/s390/include/uapi/asm/ipcbuf.h b/arch/s390/include/uapi/asm/ipcbuf.h new file mode 100644 index 0000000000..1030cd1868 --- /dev/null +++ b/arch/s390/include/uapi/asm/ipcbuf.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __S390_IPCBUF_H__ +#define __S390_IPCBUF_H__ + +#include + +/* + * The user_ipc_perm structure for S/390 architecture. + * Note extra padding because this structure is passed back and forth + * between kernel and user space. + * + * Pad space is left for: + * - 32-bit mode_t and seq + * - 2 miscellaneous 32-bit values + */ + +struct ipc64_perm +{ + __kernel_key_t key; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_uid32_t cuid; + __kernel_gid32_t cgid; + __kernel_mode_t mode; + unsigned short __pad1; + unsigned short seq; +#ifndef __s390x__ + unsigned short __pad2; +#endif /* ! __s390x__ */ + unsigned long __unused1; + unsigned long __unused2; +}; + +#endif /* __S390_IPCBUF_H__ */ diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h new file mode 100644 index 0000000000..2cd28af50d --- /dev/null +++ b/arch/s390/include/uapi/asm/ipl.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_S390_UAPI_IPL_H +#define _ASM_S390_UAPI_IPL_H + +#include + +/* IPL Parameter List header */ +struct ipl_pl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; +} __packed; + +#define IPL_PL_FLAG_IPLPS 0x80 +#define IPL_PL_FLAG_SIPL 0x40 +#define IPL_PL_FLAG_IPLSR 0x20 + +/* IPL Parameter Block header */ +struct ipl_pb_hdr { + __u32 len; + __u8 pbt; +} __packed; + +/* IPL Parameter Block types */ +enum ipl_pbt { + IPL_PBT_FCP = 0, + IPL_PBT_SCP_DATA = 1, + IPL_PBT_CCW = 2, + IPL_PBT_ECKD = 3, + IPL_PBT_NVME = 4, +}; + +/* IPL Parameter Block 0 with common fields */ +struct ipl_pb0_common { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; +} __packed; + +#define IPL_PB0_FLAG_LOADPARM 0x80 + +/* IPL Parameter Block 0 for FCP */ +struct ipl_pb0_fcp { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u8 cssid; + __u8 reserved4[1]; + __u16 devno; + __u8 reserved5[4]; + __u64 wwpn; + __u64 lun; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_FCP_OPT_IPL 0x10 +#define IPL_PB0_FCP_OPT_DUMP 0x20 + +/* IPL Parameter Block 0 for NVMe */ +struct ipl_pb0_nvme { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u32 fid; + __u8 reserved4[12]; + __u32 nsid; + __u8 reserved5[4]; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_NVME_OPT_IPL 0x10 +#define IPL_PB0_NVME_OPT_DUMP 0x20 + +/* IPL Parameter Block 0 for CCW */ +struct ipl_pb0_ccw { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; + __u16 reserved3 : 13; + __u8 ssid : 3; + __u16 devno; + __u8 vm_flags; + __u8 reserved4[3]; + __u32 vm_parm_len; + __u8 nss_name[8]; + __u8 vm_parm[64]; + __u8 reserved5[8]; +} __packed; + +/* IPL Parameter Block 0 for ECKD */ +struct ipl_pb0_eckd { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u32 reserved2[78]; + __u8 opt; + __u8 reserved4[4]; + __u8 reserved5:5; + __u8 ssid:3; + __u16 devno; + __u32 reserved6[5]; + __u32 bootprog; + __u8 reserved7[12]; + struct { + __u16 cyl; + __u8 head; + __u8 record; + __u32 reserved; + } br_chr __packed; + __u32 scp_data_len; + __u8 reserved8[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_ECKD_OPT_IPL 0x10 +#define IPL_PB0_ECKD_OPT_DUMP 0x20 + +#define IPL_PB0_CCW_VM_FLAG_NSS 0x80 +#define IPL_PB0_CCW_VM_FLAG_VP 0x40 + +/* IPL Parameter Block 1 for additional SCP data */ +struct ipl_pb1_scp_data { + __u32 len; + __u8 pbt; + __u8 scp_data[]; +} __packed; + +/* IPL Report List header */ +struct ipl_rl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; + __u8 reserved2[8]; +} __packed; + +/* IPL Report Block header */ +struct ipl_rb_hdr { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; +} __packed; + +/* IPL Report Block types */ +enum ipl_rbt { + IPL_RBT_CERTIFICATES = 1, + IPL_RBT_COMPONENTS = 2, +}; + +/* IPL Report Block for the certificate list */ +struct ipl_rb_certificate_entry { + __u64 addr; + __u64 len; +} __packed; + +struct ipl_rb_certificates { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_certificate_entry entries[]; +} __packed; + +/* IPL Report Block for the component list */ +struct ipl_rb_component_entry { + __u64 addr; + __u64 len; + __u8 flags; + __u8 reserved1[5]; + __u16 certificate_index; + __u8 reserved2[8]; +}; + +#define IPL_RB_COMPONENT_FLAG_SIGNED 0x80 +#define IPL_RB_COMPONENT_FLAG_VERIFIED 0x40 + +struct ipl_rb_components { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_component_entry entries[]; +} __packed; + +#endif diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h new file mode 100644 index 0000000000..abe926d43c --- /dev/null +++ b/arch/s390/include/uapi/asm/kvm.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_KVM_S390_H +#define __LINUX_KVM_S390_H +/* + * KVM s390 specific structures and definitions + * + * Copyright IBM Corp. 2008, 2018 + * + * Author(s): Carsten Otte + * Christian Borntraeger + */ +#include + +#define __KVM_S390 +#define __KVM_HAVE_GUEST_DEBUG + +/* Device control API: s390-specific devices */ +#define KVM_DEV_FLIC_GET_ALL_IRQS 1 +#define KVM_DEV_FLIC_ENQUEUE 2 +#define KVM_DEV_FLIC_CLEAR_IRQS 3 +#define KVM_DEV_FLIC_APF_ENABLE 4 +#define KVM_DEV_FLIC_APF_DISABLE_WAIT 5 +#define KVM_DEV_FLIC_ADAPTER_REGISTER 6 +#define KVM_DEV_FLIC_ADAPTER_MODIFY 7 +#define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 +#define KVM_DEV_FLIC_AISM 9 +#define KVM_DEV_FLIC_AIRQ_INJECT 10 +#define KVM_DEV_FLIC_AISM_ALL 11 +/* + * We can have up to 4*64k pending subchannels + 8 adapter interrupts, + * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. + * There are also sclp and machine checks. This gives us + * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000 + * Lets round up to 8192 pages. + */ +#define KVM_S390_MAX_FLOAT_IRQS 266250 +#define KVM_S390_FLIC_MAX_BUFFER 0x2000000 + +struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 flags; +}; + +#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01 + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + +struct kvm_s390_ais_all { + __u8 simm; + __u8 nimm; +}; + +#define KVM_S390_IO_ADAPTER_MASK 1 +#define KVM_S390_IO_ADAPTER_MAP 2 +#define KVM_S390_IO_ADAPTER_UNMAP 3 + +struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; +}; + +/* kvm attr_group on vm fd */ +#define KVM_S390_VM_MEM_CTRL 0 +#define KVM_S390_VM_TOD 1 +#define KVM_S390_VM_CRYPTO 2 +#define KVM_S390_VM_CPU_MODEL 3 +#define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 + +/* kvm attributes for mem_ctrl */ +#define KVM_S390_VM_MEM_ENABLE_CMMA 0 +#define KVM_S390_VM_MEM_CLR_CMMA 1 +#define KVM_S390_VM_MEM_LIMIT_SIZE 2 + +#define KVM_S390_NO_MEM_LIMIT U64_MAX + +/* kvm attributes for KVM_S390_VM_TOD */ +#define KVM_S390_VM_TOD_LOW 0 +#define KVM_S390_VM_TOD_HIGH 1 +#define KVM_S390_VM_TOD_EXT 2 + +struct kvm_s390_vm_tod_clock { + __u8 epoch_idx; + __u64 tod; +}; + +/* kvm attributes for KVM_S390_VM_CPU_MODEL */ +/* processor related attributes are r/w */ +#define KVM_S390_VM_CPU_PROCESSOR 0 +struct kvm_s390_vm_cpu_processor { + __u64 cpuid; + __u16 ibc; + __u8 pad[6]; + __u64 fac_list[256]; +}; + +/* machine related attributes are r/o */ +#define KVM_S390_VM_CPU_MACHINE 1 +struct kvm_s390_vm_cpu_machine { + __u64 cpuid; + __u32 ibc; + __u8 pad[4]; + __u64 fac_mask[256]; + __u64 fac_list[256]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2 +#define KVM_S390_VM_CPU_MACHINE_FEAT 3 + +#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 +#define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +#define KVM_S390_VM_CPU_FEAT_KSS 13 +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4 +#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5 +/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */ +struct kvm_s390_vm_cpu_subfunc { + __u8 plo[32]; /* always */ + __u8 ptff[16]; /* with TOD-clock steering */ + __u8 kmac[16]; /* with MSA */ + __u8 kmc[16]; /* with MSA */ + __u8 km[16]; /* with MSA */ + __u8 kimd[16]; /* with MSA */ + __u8 klmd[16]; /* with MSA */ + __u8 pckmo[16]; /* with MSA3 */ + __u8 kmctr[16]; /* with MSA4 */ + __u8 kmf[16]; /* with MSA4 */ + __u8 kmo[16]; /* with MSA4 */ + __u8 pcc[16]; /* with MSA4 */ + __u8 ppno[16]; /* with MSA5 */ + __u8 kma[16]; /* with MSA8 */ + __u8 kdsa[16]; /* with MSA9 */ + __u8 sortl[32]; /* with STFLE.150 */ + __u8 dfltcc[32]; /* with STFLE.151 */ + __u8 reserved[1728]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6 +#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST 7 + +#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64 +struct kvm_s390_vm_cpu_uv_feat { + union { + struct { + __u64 : 4; + __u64 ap : 1; /* bit 4 */ + __u64 ap_intr : 1; /* bit 5 */ + __u64 : 58; + }; + __u64 feat; + }; +}; + +/* kvm attributes for crypto */ +#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 +#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 +#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 +#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define KVM_S390_VM_CRYPTO_DISABLE_APIE 5 + +/* kvm attributes for migration mode */ +#define KVM_S390_VM_MIGRATION_STOP 0 +#define KVM_S390_VM_MIGRATION_START 1 +#define KVM_S390_VM_MIGRATION_STATUS 2 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* general purpose regs for s390 */ + __u64 gprs[16]; +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + __u32 acrs[16]; + __u64 crs[16]; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u32 fpc; + __u64 fprs[16]; +}; + +#define KVM_GUESTDBG_USE_HW_BP 0x00010000 + +#define KVM_HW_BP 1 +#define KVM_HW_WP_WRITE 2 +#define KVM_SINGLESTEP 4 + +struct kvm_debug_exit_arch { + __u64 addr; + __u8 type; + __u8 pad[7]; /* Should be set to 0 */ +}; + +struct kvm_hw_breakpoint { + __u64 addr; + __u64 phys_addr; + __u64 len; + __u8 type; + __u8 pad[7]; /* Should be set to 0 */ +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u32 nr_hw_bp; + __u32 pad; /* Should be set to 0 */ + struct kvm_hw_breakpoint __user *hw_bp; +}; + +/* for KVM_SYNC_PFAULT and KVM_REG_S390_PFTOKEN */ +#define KVM_S390_PFAULT_TOKEN_INVALID 0xffffffffffffffffULL + +#define KVM_SYNC_PREFIX (1UL << 0) +#define KVM_SYNC_GPRS (1UL << 1) +#define KVM_SYNC_ACRS (1UL << 2) +#define KVM_SYNC_CRS (1UL << 3) +#define KVM_SYNC_ARCH0 (1UL << 4) +#define KVM_SYNC_PFAULT (1UL << 5) +#define KVM_SYNC_VRS (1UL << 6) +#define KVM_SYNC_RICCB (1UL << 7) +#define KVM_SYNC_FPRS (1UL << 8) +#define KVM_SYNC_GSCB (1UL << 9) +#define KVM_SYNC_BPBC (1UL << 10) +#define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) + +#define KVM_SYNC_S390_VALID_FIELDS \ + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) + +/* length and alignment of the sdnx as a power of two */ +#define SDNXC 8 +#define SDNXL (1UL << SDNXC) +/* definition of registers in kvm_run */ +struct kvm_sync_regs { + __u64 prefix; /* prefix register */ + __u64 gprs[16]; /* general purpose registers */ + __u32 acrs[16]; /* access registers */ + __u64 crs[16]; /* control registers */ + __u64 todpr; /* tod programmable register [ARCH0] */ + __u64 cputm; /* cpu timer [ARCH0] */ + __u64 ckc; /* clock comparator [ARCH0] */ + __u64 pp; /* program parameter [ARCH0] */ + __u64 gbea; /* guest breaking-event address [ARCH0] */ + __u64 pft; /* pfault token [PFAULT] */ + __u64 pfs; /* pfault select [PFAULT] */ + __u64 pfc; /* pfault compare [PFAULT] */ + union { + __u64 vrs[32][2]; /* vector registers (KVM_SYNC_VRS) */ + __u64 fprs[16]; /* fp registers (KVM_SYNC_FPRS) */ + }; + __u8 reserved[512]; /* for future vector expansion */ + __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ + __u8 bpbc : 1; /* bp mode */ + __u8 reserved2 : 7; + __u8 padding1[51]; /* riccb needs to be 64byte aligned */ + __u8 riccb[64]; /* runtime instrumentation controls block */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { + __u64 reserved1[2]; + __u64 gscb[4]; + __u64 etoken; + __u64 etoken_extension; + }; + }; +}; + +#define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) +#define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2) +#define KVM_REG_S390_CPU_TIMER (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3) +#define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4) +#define KVM_REG_S390_PFTOKEN (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5) +#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6) +#define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_S390_PP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_S390_GBEA (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9) +#endif diff --git a/arch/s390/include/uapi/asm/kvm_para.h b/arch/s390/include/uapi/asm/kvm_para.h new file mode 100644 index 0000000000..b9ab584adf --- /dev/null +++ b/arch/s390/include/uapi/asm/kvm_para.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * User API definitions for paravirtual devices on s390 + * + * Copyright IBM Corp. 2008 + * + * Author(s): Christian Borntraeger + */ diff --git a/arch/s390/include/uapi/asm/kvm_perf.h b/arch/s390/include/uapi/asm/kvm_perf.h new file mode 100644 index 0000000000..84606b8cc4 --- /dev/null +++ b/arch/s390/include/uapi/asm/kvm_perf.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Definitions for perf-kvm on s390 + * + * Copyright 2014 IBM Corp. + * Author(s): Alexander Yarygin + */ + +#ifndef __LINUX_KVM_PERF_S390_H +#define __LINUX_KVM_PERF_S390_H + +#include + +#define DECODE_STR_LEN 40 + +#define VCPU_ID "id" + +#define KVM_ENTRY_TRACE "kvm:kvm_s390_sie_enter" +#define KVM_EXIT_TRACE "kvm:kvm_s390_sie_exit" +#define KVM_EXIT_REASON "icptcode" + +#endif diff --git a/arch/s390/include/uapi/asm/monwriter.h b/arch/s390/include/uapi/asm/monwriter.h new file mode 100644 index 0000000000..03d172f314 --- /dev/null +++ b/arch/s390/include/uapi/asm/monwriter.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2006 + * Character device driver for writing z/VM APPLDATA monitor records + * Version 1.0 + * Author(s): Melissa Howland + * + */ + +#ifndef _ASM_390_MONWRITER_H +#define _ASM_390_MONWRITER_H + +/* mon_function values */ +#define MONWRITE_START_INTERVAL 0x00 /* start interval recording */ +#define MONWRITE_STOP_INTERVAL 0x01 /* stop interval or config recording */ +#define MONWRITE_GEN_EVENT 0x02 /* generate event record */ +#define MONWRITE_START_CONFIG 0x03 /* start configuration recording */ + +/* the header the app uses in its write() data */ +struct monwrite_hdr { + unsigned char mon_function; + unsigned short applid; + unsigned char record_num; + unsigned short version; + unsigned short release; + unsigned short mod_level; + unsigned short datalen; + unsigned char hdrlen; + +} __attribute__((packed)); + +#endif /* _ASM_390_MONWRITER_H */ diff --git a/arch/s390/include/uapi/asm/perf_regs.h b/arch/s390/include/uapi/asm/perf_regs.h new file mode 100644 index 0000000000..d17dd9e5d5 --- /dev/null +++ b/arch/s390/include/uapi/asm/perf_regs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_S390_PERF_REGS_H +#define _ASM_S390_PERF_REGS_H + +enum perf_event_s390_regs { + PERF_REG_S390_R0, + PERF_REG_S390_R1, + PERF_REG_S390_R2, + PERF_REG_S390_R3, + PERF_REG_S390_R4, + PERF_REG_S390_R5, + PERF_REG_S390_R6, + PERF_REG_S390_R7, + PERF_REG_S390_R8, + PERF_REG_S390_R9, + PERF_REG_S390_R10, + PERF_REG_S390_R11, + PERF_REG_S390_R12, + PERF_REG_S390_R13, + PERF_REG_S390_R14, + PERF_REG_S390_R15, + PERF_REG_S390_FP0, + PERF_REG_S390_FP1, + PERF_REG_S390_FP2, + PERF_REG_S390_FP3, + PERF_REG_S390_FP4, + PERF_REG_S390_FP5, + PERF_REG_S390_FP6, + PERF_REG_S390_FP7, + PERF_REG_S390_FP8, + PERF_REG_S390_FP9, + PERF_REG_S390_FP10, + PERF_REG_S390_FP11, + PERF_REG_S390_FP12, + PERF_REG_S390_FP13, + PERF_REG_S390_FP14, + PERF_REG_S390_FP15, + PERF_REG_S390_MASK, + PERF_REG_S390_PC, + + PERF_REG_S390_MAX +}; + +#endif /* _ASM_S390_PERF_REGS_H */ diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h new file mode 100644 index 0000000000..5ad76471e7 --- /dev/null +++ b/arch/s390/include/uapi/asm/pkey.h @@ -0,0 +1,452 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interface to the pkey device driver + * + * Copyright IBM Corp. 2017, 2023 + * + * Author: Harald Freudenberger + * + */ + +#ifndef _UAPI_PKEY_H +#define _UAPI_PKEY_H + +#include +#include + +/* + * Ioctl calls supported by the pkey device driver + */ + +#define PKEY_IOCTL_MAGIC 'p' + +#define SECKEYBLOBSIZE 64 /* secure key blob size is always 64 bytes */ +#define PROTKEYBLOBSIZE 80 /* protected key blob size is always 80 bytes */ +#define MAXPROTKEYSIZE 64 /* a protected key blob may be up to 64 bytes */ +#define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */ +#define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */ +#define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */ +#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */ + +/* Minimum size of a key blob */ +#define MINKEYBLOBSIZE SECKEYBLOBSIZE + +/* defines for the type field within the pkey_protkey struct */ +#define PKEY_KEYTYPE_AES_128 1 +#define PKEY_KEYTYPE_AES_192 2 +#define PKEY_KEYTYPE_AES_256 3 +#define PKEY_KEYTYPE_ECC 4 +#define PKEY_KEYTYPE_ECC_P256 5 +#define PKEY_KEYTYPE_ECC_P384 6 +#define PKEY_KEYTYPE_ECC_P521 7 +#define PKEY_KEYTYPE_ECC_ED25519 8 +#define PKEY_KEYTYPE_ECC_ED448 9 + +/* the newer ioctls use a pkey_key_type enum for type information */ +enum pkey_key_type { + PKEY_TYPE_CCA_DATA = (__u32) 1, + PKEY_TYPE_CCA_CIPHER = (__u32) 2, + PKEY_TYPE_EP11 = (__u32) 3, + PKEY_TYPE_CCA_ECC = (__u32) 0x1f, + PKEY_TYPE_EP11_AES = (__u32) 6, + PKEY_TYPE_EP11_ECC = (__u32) 7, +}; + +/* the newer ioctls use a pkey_key_size enum for key size information */ +enum pkey_key_size { + PKEY_SIZE_AES_128 = (__u32) 128, + PKEY_SIZE_AES_192 = (__u32) 192, + PKEY_SIZE_AES_256 = (__u32) 256, + PKEY_SIZE_UNKNOWN = (__u32) 0xFFFFFFFF, +}; + +/* some of the newer ioctls use these flags */ +#define PKEY_FLAGS_MATCH_CUR_MKVP 0x00000002 +#define PKEY_FLAGS_MATCH_ALT_MKVP 0x00000004 + +/* keygenflags defines for CCA AES cipher keys */ +#define PKEY_KEYGEN_XPRT_SYM 0x00008000 +#define PKEY_KEYGEN_XPRT_UASY 0x00004000 +#define PKEY_KEYGEN_XPRT_AASY 0x00002000 +#define PKEY_KEYGEN_XPRT_RAW 0x00001000 +#define PKEY_KEYGEN_XPRT_CPAC 0x00000800 +#define PKEY_KEYGEN_XPRT_DES 0x00000080 +#define PKEY_KEYGEN_XPRT_AES 0x00000040 +#define PKEY_KEYGEN_XPRT_RSA 0x00000008 + +/* Struct to hold apqn target info (card/domain pair) */ +struct pkey_apqn { + __u16 card; + __u16 domain; +}; + +/* Struct to hold a CCA AES secure key blob */ +struct pkey_seckey { + __u8 seckey[SECKEYBLOBSIZE]; /* the secure key blob */ +}; + +/* Struct to hold protected key and length info */ +struct pkey_protkey { + __u32 type; /* key type, one of the PKEY_KEYTYPE_AES values */ + __u32 len; /* bytes actually stored in protkey[] */ + __u8 protkey[MAXPROTKEYSIZE]; /* the protected key blob */ +}; + +/* Struct to hold an AES clear key value */ +struct pkey_clrkey { + __u8 clrkey[MAXCLRKEYSIZE]; /* 16, 24, or 32 byte clear key value */ +}; + +/* + * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC + * are ep11 blobs prepended by this header: + */ +struct ep11kblob_header { + __u8 type; /* always 0x00 */ + __u8 hver; /* header version, currently needs to be 0x00 */ + __u16 len; /* total length in bytes (including this header) */ + __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */ + __u8 res0; /* unused */ + __u16 bitlen; /* clear key bit len, 0 for unknown */ + __u8 res1[8]; /* unused */ +} __packed; + +/* + * Generate CCA AES secure key. + */ +struct pkey_genseck { + __u16 cardnr; /* in: card to use or FFFF for any */ + __u16 domain; /* in: domain or FFFF for any */ + __u32 keytype; /* in: key type to generate */ + struct pkey_seckey seckey; /* out: the secure key blob */ +}; +#define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck) + +/* + * Construct CCA AES secure key from clear key value + */ +struct pkey_clr2seck { + __u16 cardnr; /* in: card to use or FFFF for any */ + __u16 domain; /* in: domain or FFFF for any */ + __u32 keytype; /* in: key type to generate */ + struct pkey_clrkey clrkey; /* in: the clear key value */ + struct pkey_seckey seckey; /* out: the secure key blob */ +}; +#define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck) + +/* + * Fabricate AES protected key from a CCA AES secure key + */ +struct pkey_sec2protk { + __u16 cardnr; /* in: card to use or FFFF for any */ + __u16 domain; /* in: domain or FFFF for any */ + struct pkey_seckey seckey; /* in: the secure key blob */ + struct pkey_protkey protkey; /* out: the protected key */ +}; +#define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk) + +/* + * Fabricate AES protected key from clear key value + */ +struct pkey_clr2protk { + __u32 keytype; /* in: key type to generate */ + struct pkey_clrkey clrkey; /* in: the clear key value */ + struct pkey_protkey protkey; /* out: the protected key */ +}; +#define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk) + +/* + * Search for matching crypto card based on the Master Key + * Verification Pattern provided inside a CCA AES secure key. + */ +struct pkey_findcard { + struct pkey_seckey seckey; /* in: the secure key blob */ + __u16 cardnr; /* out: card number */ + __u16 domain; /* out: domain number */ +}; +#define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard) + +/* + * Combined together: findcard + sec2prot + */ +struct pkey_skey2pkey { + struct pkey_seckey seckey; /* in: the secure key blob */ + struct pkey_protkey protkey; /* out: the protected key */ +}; +#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) + +/* + * Verify the given CCA AES secure key for being able to be usable with + * the pkey module. Check for correct key type and check for having at + * least one crypto card being able to handle this key (master key + * or old master key verification pattern matches). + * Return some info about the key: keysize in bits, keytype (currently + * only AES), flag if key is wrapped with an old MKVP. + */ +struct pkey_verifykey { + struct pkey_seckey seckey; /* in: the secure key blob */ + __u16 cardnr; /* out: card number */ + __u16 domain; /* out: domain number */ + __u16 keysize; /* out: key size in bits */ + __u32 attributes; /* out: attribute bits */ +}; +#define PKEY_VERIFYKEY _IOWR(PKEY_IOCTL_MAGIC, 0x07, struct pkey_verifykey) +#define PKEY_VERIFY_ATTR_AES 0x00000001 /* key is an AES key */ +#define PKEY_VERIFY_ATTR_OLD_MKVP 0x00000100 /* key has old MKVP value */ + +/* + * Generate AES random protected key. + */ +struct pkey_genprotk { + __u32 keytype; /* in: key type to generate */ + struct pkey_protkey protkey; /* out: the protected key */ +}; + +#define PKEY_GENPROTK _IOWR(PKEY_IOCTL_MAGIC, 0x08, struct pkey_genprotk) + +/* + * Verify an AES protected key. + */ +struct pkey_verifyprotk { + struct pkey_protkey protkey; /* in: the protected key to verify */ +}; + +#define PKEY_VERIFYPROTK _IOW(PKEY_IOCTL_MAGIC, 0x09, struct pkey_verifyprotk) + +/* + * Transform an key blob (of any type) into a protected key + */ +struct pkey_kblob2pkey { + __u8 __user *key; /* in: the key blob */ + __u32 keylen; /* in: the key blob length */ + struct pkey_protkey protkey; /* out: the protected key */ +}; +#define PKEY_KBLOB2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x0A, struct pkey_kblob2pkey) + +/* + * Generate secure key, version 2. + * Generate CCA AES secure key, CCA AES cipher key or EP11 AES secure key. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to + * generate a list of apqns based on the key type to generate. + * The keygenflags argument is passed to the low level generation functions + * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. + */ +struct pkey_genseck2 { + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets*/ + __u32 apqn_entries; /* in: # of apqn target list entries */ + enum pkey_key_type type; /* in: key type to generate */ + enum pkey_key_size size; /* in: key size to generate */ + __u32 keygenflags; /* in: key generation flags */ + __u8 __user *key; /* in: pointer to key blob buffer */ + __u32 keylen; /* in: available key blob buffer size */ + /* out: actual key blob size */ +}; +#define PKEY_GENSECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x11, struct pkey_genseck2) + +/* + * Generate secure key from clear key value, version 2. + * Construct an CCA AES secure key, CCA AES cipher key or EP11 AES secure + * key from a given clear key value. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to + * generate a list of apqns based on the key type to generate. + * The keygenflags argument is passed to the low level generation functions + * individual for the key type and has a key type specific meaning. When + * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_* + * flags to widen the export possibilities. By default a cipher key is + * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC). + * The keygenflag argument for generating an EP11 AES key should either be 0 + * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and + * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags. + */ +struct pkey_clr2seck2 { + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + enum pkey_key_type type; /* in: key type to generate */ + enum pkey_key_size size; /* in: key size to generate */ + __u32 keygenflags; /* in: key generation flags */ + struct pkey_clrkey clrkey; /* in: the clear key value */ + __u8 __user *key; /* in: pointer to key blob buffer */ + __u32 keylen; /* in: available key blob buffer size */ + /* out: actual key blob size */ +}; +#define PKEY_CLR2SECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x12, struct pkey_clr2seck2) + +/* + * Verify the given secure key, version 2. + * Check for correct key type. If cardnr and domain are given (are not + * 0xFFFF) also check if this apqn is able to handle this type of key. + * If cardnr and/or domain is 0xFFFF, on return these values are filled + * with one apqn able to handle this key. + * The function also checks for the master key verification patterns + * of the key matching to the current or alternate mkvp of the apqn. + * For CCA AES secure keys and CCA AES cipher keys this means to check + * the key's mkvp against the current or old mkvp of the apqns. The flags + * field is updated with some additional info about the apqn mkvp + * match: If the current mkvp matches to the key's mkvp then the + * PKEY_FLAGS_MATCH_CUR_MKVP bit is set, if the alternate mkvp matches to + * the key's mkvp the PKEY_FLAGS_MATCH_ALT_MKVP is set. For CCA keys the + * alternate mkvp is the old master key verification pattern. + * CCA AES secure keys are also checked to have the CPACF export allowed + * bit enabled (XPRTCPAC) in the kmf1 field. + * EP11 keys are also supported and the wkvp of the key is checked against + * the current wkvp of the apqns. There is no alternate for this type of + * key and so on a match the flag PKEY_FLAGS_MATCH_CUR_MKVP always is set. + * EP11 keys are also checked to have XCP_BLOB_PROTKEY_EXTRACTABLE set. + * The ioctl returns 0 as long as the given or found apqn matches to + * matches with the current or alternate mkvp to the key's mkvp. If the given + * apqn does not match or there is no such apqn found, -1 with errno + * ENODEV is returned. + */ +struct pkey_verifykey2 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + __u16 cardnr; /* in/out: card number */ + __u16 domain; /* in/out: domain number */ + enum pkey_key_type type; /* out: the key type */ + enum pkey_key_size size; /* out: the key size */ + __u32 flags; /* out: additional key info flags */ +}; +#define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2) + +/* + * Transform a key blob into a protected key, version 2. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to + * generate a list of apqns based on the key. + * Deriving ECC protected keys from ECC secure keys is not supported with + * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose. + */ +struct pkey_kblob2pkey2 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + struct pkey_protkey protkey; /* out: the protected key */ +}; +#define PKEY_KBLOB2PROTK2 _IOWR(PKEY_IOCTL_MAGIC, 0x1A, struct pkey_kblob2pkey2) + +/* + * Build a list of APQNs based on a key blob given. + * Is able to find out which type of secure key is given (CCA AES secure + * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private + * key) and tries to find all matching crypto cards based on the MKVP and maybe + * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys + * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of + * APQNs is further filtered by the key's mkvp which needs to match to either + * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters + * only) of the apqns. The flags argument may be used to limit the matching + * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of + * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both + * are given, it is assumed to return apqns where either the current or the + * alternate mkvp matches. At least one of the matching flags needs to be given. + * The flags argument for EP11 keys has no further action and is currently + * ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP) as there is only + * the wkvp from the key to match against the apqn's wkvp. + * The list of matching apqns is stored into the space given by the apqns + * argument and the number of stored entries goes into apqn_entries. If the list + * is empty (apqn_entries is 0) the apqn_entries field is updated to the number + * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 + * but the number of apqn targets does not fit into the list, the apqn_targets + * field is updated with the number of required entries but there are no apqn + * values stored in the list and the ioctl returns with ENOSPC. If no matching + * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. + */ +struct pkey_apqns4key { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + __u32 flags; /* in: match controlling flags */ + struct pkey_apqn __user *apqns; /* in/out: ptr to list of apqn targets*/ + __u32 apqn_entries; /* in: max # of apqn entries in the list */ + /* out: # apqns stored into the list */ +}; +#define PKEY_APQNS4K _IOWR(PKEY_IOCTL_MAGIC, 0x1B, struct pkey_apqns4key) + +/* + * Build a list of APQNs based on a key type given. + * Build a list of APQNs based on a given key type and maybe further + * restrict the list by given master key verification patterns. + * For different key types there may be different ways to match the + * master key verification patterns. For CCA keys (CCA data key and CCA + * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value + * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp. + * For CCA ECC keys it is similar but the match is against the APKA current/old + * mkvp. The flags argument controls if the apqns current and/or alternate mkvp + * should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current + * mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. + * If both are given, it is assumed to return apqns where either the + * current or the alternate mkvp matches. If no match flag is given + * (flags is 0) the mkvp values are ignored for the match process. + * For EP11 keys there is only the current wkvp. So if the apqns should also + * match to a given wkvp, then the PKEY_FLAGS_MATCH_CUR_MKVP flag should be + * set. The wkvp value is 32 bytes but only the leftmost 16 bytes are compared + * against the leftmost 16 byte of the wkvp of the apqn. + * The list of matching apqns is stored into the space given by the apqns + * argument and the number of stored entries goes into apqn_entries. If the list + * is empty (apqn_entries is 0) the apqn_entries field is updated to the number + * of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0 + * but the number of apqn targets does not fit into the list, the apqn_targets + * field is updated with the number of required entries but there are no apqn + * values stored in the list and the ioctl returns with ENOSPC. If no matching + * APQN is found, the ioctl returns with 0 but the apqn_entries value is 0. + */ +struct pkey_apqns4keytype { + enum pkey_key_type type; /* in: key type */ + __u8 cur_mkvp[32]; /* in: current mkvp */ + __u8 alt_mkvp[32]; /* in: alternate mkvp */ + __u32 flags; /* in: match controlling flags */ + struct pkey_apqn __user *apqns; /* in/out: ptr to list of apqn targets*/ + __u32 apqn_entries; /* in: max # of apqn entries in the list */ + /* out: # apqns stored into the list */ +}; +#define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype) + +/* + * Transform a key blob into a protected key, version 3. + * The difference to version 2 of this ioctl is that the protected key + * buffer is now explicitly and not within a struct pkey_protkey any more. + * So this ioctl is also able to handle EP11 and CCA ECC secure keys and + * provide ECC protected keys. + * There needs to be a list of apqns given with at least one entry in there. + * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain + * is not supported. The implementation walks through the list of apqns and + * tries to send the request to each apqn without any further checking (like + * card type or online state). If the apqn fails, simple the next one in the + * list is tried until success (return 0) or the end of the list is reached + * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to + * generate a list of apqns based on the key. + */ +struct pkey_kblob2pkey3 { + __u8 __user *key; /* in: pointer to key blob */ + __u32 keylen; /* in: key blob size */ + struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */ + __u32 apqn_entries; /* in: # of apqn target list entries */ + __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */ + __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */ + __u8 __user *pkey; /* in: pkey blob buffer space ptr */ +}; +#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3) + +#endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/posix_types.h b/arch/s390/include/uapi/asm/posix_types.h new file mode 100644 index 0000000000..1913613e71 --- /dev/null +++ b/arch/s390/include/uapi/asm/posix_types.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + */ + +#ifndef __ARCH_S390_POSIX_TYPES_H +#define __ARCH_S390_POSIX_TYPES_H + +/* + * This file is generally used by user-level software, so you need to + * be a little careful about namespace pollution etc. Also, we cannot + * assume GCC is being used. + */ + +typedef unsigned long __kernel_size_t; +typedef long __kernel_ssize_t; +#define __kernel_size_t __kernel_size_t + +typedef unsigned short __kernel_old_dev_t; +#define __kernel_old_dev_t __kernel_old_dev_t + +#ifdef __KERNEL__ +typedef unsigned short __kernel_old_uid_t; +typedef unsigned short __kernel_old_gid_t; +#define __kernel_old_uid_t __kernel_old_uid_t +#endif + +#ifndef __s390x__ + +typedef unsigned long __kernel_ino_t; +typedef unsigned short __kernel_mode_t; +typedef unsigned short __kernel_ipc_pid_t; +typedef unsigned short __kernel_uid_t; +typedef unsigned short __kernel_gid_t; +typedef int __kernel_ptrdiff_t; + +#else /* __s390x__ */ + +typedef unsigned int __kernel_ino_t; +typedef unsigned int __kernel_mode_t; +typedef int __kernel_ipc_pid_t; +typedef unsigned int __kernel_uid_t; +typedef unsigned int __kernel_gid_t; +typedef long __kernel_ptrdiff_t; +typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ + +#endif /* __s390x__ */ + +#define __kernel_ino_t __kernel_ino_t +#define __kernel_mode_t __kernel_mode_t +#define __kernel_ipc_pid_t __kernel_ipc_pid_t +#define __kernel_uid_t __kernel_uid_t +#define __kernel_gid_t __kernel_gid_t + +#include + +#endif diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h new file mode 100644 index 0000000000..bb0826024b --- /dev/null +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + */ + +#ifndef _UAPI_S390_PTRACE_H +#define _UAPI_S390_PTRACE_H + +#include + +/* + * Offsets in the user_regs_struct. They are used for the ptrace + * system call and in entry.S + */ +#ifndef __s390x__ + +#define PT_PSWMASK 0x00 +#define PT_PSWADDR 0x04 +#define PT_GPR0 0x08 +#define PT_GPR1 0x0C +#define PT_GPR2 0x10 +#define PT_GPR3 0x14 +#define PT_GPR4 0x18 +#define PT_GPR5 0x1C +#define PT_GPR6 0x20 +#define PT_GPR7 0x24 +#define PT_GPR8 0x28 +#define PT_GPR9 0x2C +#define PT_GPR10 0x30 +#define PT_GPR11 0x34 +#define PT_GPR12 0x38 +#define PT_GPR13 0x3C +#define PT_GPR14 0x40 +#define PT_GPR15 0x44 +#define PT_ACR0 0x48 +#define PT_ACR1 0x4C +#define PT_ACR2 0x50 +#define PT_ACR3 0x54 +#define PT_ACR4 0x58 +#define PT_ACR5 0x5C +#define PT_ACR6 0x60 +#define PT_ACR7 0x64 +#define PT_ACR8 0x68 +#define PT_ACR9 0x6C +#define PT_ACR10 0x70 +#define PT_ACR11 0x74 +#define PT_ACR12 0x78 +#define PT_ACR13 0x7C +#define PT_ACR14 0x80 +#define PT_ACR15 0x84 +#define PT_ORIGGPR2 0x88 +#define PT_FPC 0x90 +/* + * A nasty fact of life that the ptrace api + * only supports passing of longs. + */ +#define PT_FPR0_HI 0x98 +#define PT_FPR0_LO 0x9C +#define PT_FPR1_HI 0xA0 +#define PT_FPR1_LO 0xA4 +#define PT_FPR2_HI 0xA8 +#define PT_FPR2_LO 0xAC +#define PT_FPR3_HI 0xB0 +#define PT_FPR3_LO 0xB4 +#define PT_FPR4_HI 0xB8 +#define PT_FPR4_LO 0xBC +#define PT_FPR5_HI 0xC0 +#define PT_FPR5_LO 0xC4 +#define PT_FPR6_HI 0xC8 +#define PT_FPR6_LO 0xCC +#define PT_FPR7_HI 0xD0 +#define PT_FPR7_LO 0xD4 +#define PT_FPR8_HI 0xD8 +#define PT_FPR8_LO 0XDC +#define PT_FPR9_HI 0xE0 +#define PT_FPR9_LO 0xE4 +#define PT_FPR10_HI 0xE8 +#define PT_FPR10_LO 0xEC +#define PT_FPR11_HI 0xF0 +#define PT_FPR11_LO 0xF4 +#define PT_FPR12_HI 0xF8 +#define PT_FPR12_LO 0xFC +#define PT_FPR13_HI 0x100 +#define PT_FPR13_LO 0x104 +#define PT_FPR14_HI 0x108 +#define PT_FPR14_LO 0x10C +#define PT_FPR15_HI 0x110 +#define PT_FPR15_LO 0x114 +#define PT_CR_9 0x118 +#define PT_CR_10 0x11C +#define PT_CR_11 0x120 +#define PT_IEEE_IP 0x13C +#define PT_LASTOFF PT_IEEE_IP +#define PT_ENDREGS 0x140-1 + +#define GPR_SIZE 4 +#define CR_SIZE 4 + +#define STACK_FRAME_OVERHEAD 96 /* size of minimum stack frame */ + +#else /* __s390x__ */ + +#define PT_PSWMASK 0x00 +#define PT_PSWADDR 0x08 +#define PT_GPR0 0x10 +#define PT_GPR1 0x18 +#define PT_GPR2 0x20 +#define PT_GPR3 0x28 +#define PT_GPR4 0x30 +#define PT_GPR5 0x38 +#define PT_GPR6 0x40 +#define PT_GPR7 0x48 +#define PT_GPR8 0x50 +#define PT_GPR9 0x58 +#define PT_GPR10 0x60 +#define PT_GPR11 0x68 +#define PT_GPR12 0x70 +#define PT_GPR13 0x78 +#define PT_GPR14 0x80 +#define PT_GPR15 0x88 +#define PT_ACR0 0x90 +#define PT_ACR1 0x94 +#define PT_ACR2 0x98 +#define PT_ACR3 0x9C +#define PT_ACR4 0xA0 +#define PT_ACR5 0xA4 +#define PT_ACR6 0xA8 +#define PT_ACR7 0xAC +#define PT_ACR8 0xB0 +#define PT_ACR9 0xB4 +#define PT_ACR10 0xB8 +#define PT_ACR11 0xBC +#define PT_ACR12 0xC0 +#define PT_ACR13 0xC4 +#define PT_ACR14 0xC8 +#define PT_ACR15 0xCC +#define PT_ORIGGPR2 0xD0 +#define PT_FPC 0xD8 +#define PT_FPR0 0xE0 +#define PT_FPR1 0xE8 +#define PT_FPR2 0xF0 +#define PT_FPR3 0xF8 +#define PT_FPR4 0x100 +#define PT_FPR5 0x108 +#define PT_FPR6 0x110 +#define PT_FPR7 0x118 +#define PT_FPR8 0x120 +#define PT_FPR9 0x128 +#define PT_FPR10 0x130 +#define PT_FPR11 0x138 +#define PT_FPR12 0x140 +#define PT_FPR13 0x148 +#define PT_FPR14 0x150 +#define PT_FPR15 0x158 +#define PT_CR_9 0x160 +#define PT_CR_10 0x168 +#define PT_CR_11 0x170 +#define PT_IEEE_IP 0x1A8 +#define PT_LASTOFF PT_IEEE_IP +#define PT_ENDREGS 0x1B0-1 + +#define GPR_SIZE 8 +#define CR_SIZE 8 + +#define STACK_FRAME_OVERHEAD 160 /* size of minimum stack frame */ + +#endif /* __s390x__ */ + +#ifndef __s390x__ + +#define PSW_MASK_PER _AC(0x40000000, UL) +#define PSW_MASK_DAT _AC(0x04000000, UL) +#define PSW_MASK_IO _AC(0x02000000, UL) +#define PSW_MASK_EXT _AC(0x01000000, UL) +#define PSW_MASK_KEY _AC(0x00F00000, UL) +#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */ +#define PSW_MASK_MCHECK _AC(0x00040000, UL) +#define PSW_MASK_WAIT _AC(0x00020000, UL) +#define PSW_MASK_PSTATE _AC(0x00010000, UL) +#define PSW_MASK_ASC _AC(0x0000C000, UL) +#define PSW_MASK_CC _AC(0x00003000, UL) +#define PSW_MASK_PM _AC(0x00000F00, UL) +#define PSW_MASK_RI _AC(0x00000000, UL) +#define PSW_MASK_EA _AC(0x00000000, UL) +#define PSW_MASK_BA _AC(0x00000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF00, UL) + +#define PSW_ADDR_AMODE _AC(0x80000000, UL) +#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x00000000, UL) +#define PSW_ASC_ACCREG _AC(0x00004000, UL) +#define PSW_ASC_SECONDARY _AC(0x00008000, UL) +#define PSW_ASC_HOME _AC(0x0000C000, UL) + +#else /* __s390x__ */ + +#define PSW_MASK_PER _AC(0x4000000000000000, UL) +#define PSW_MASK_DAT _AC(0x0400000000000000, UL) +#define PSW_MASK_IO _AC(0x0200000000000000, UL) +#define PSW_MASK_EXT _AC(0x0100000000000000, UL) +#define PSW_MASK_BASE _AC(0x0000000000000000, UL) +#define PSW_MASK_KEY _AC(0x00F0000000000000, UL) +#define PSW_MASK_MCHECK _AC(0x0004000000000000, UL) +#define PSW_MASK_WAIT _AC(0x0002000000000000, UL) +#define PSW_MASK_PSTATE _AC(0x0001000000000000, UL) +#define PSW_MASK_ASC _AC(0x0000C00000000000, UL) +#define PSW_MASK_CC _AC(0x0000300000000000, UL) +#define PSW_MASK_PM _AC(0x00000F0000000000, UL) +#define PSW_MASK_RI _AC(0x0000008000000000, UL) +#define PSW_MASK_EA _AC(0x0000000100000000, UL) +#define PSW_MASK_BA _AC(0x0000000080000000, UL) + +#define PSW_MASK_USER _AC(0x0000FF0180000000, UL) + +#define PSW_ADDR_AMODE _AC(0x0000000000000000, UL) +#define PSW_ADDR_INSN _AC(0xFFFFFFFFFFFFFFFF, UL) + +#define PSW_ASC_PRIMARY _AC(0x0000000000000000, UL) +#define PSW_ASC_ACCREG _AC(0x0000400000000000, UL) +#define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL) +#define PSW_ASC_HOME _AC(0x0000C00000000000, UL) + +#endif /* __s390x__ */ + +#define NUM_GPRS 16 +#define NUM_FPRS 16 +#define NUM_CRS 16 +#define NUM_ACRS 16 + +#define NUM_CR_WORDS 3 + +#define FPR_SIZE 8 +#define FPC_SIZE 4 +#define FPC_PAD_SIZE 4 /* gcc insists on aligning the fpregs */ +#define ACR_SIZE 4 + + +#define PTRACE_OLDSETOPTIONS 21 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 +#ifndef __ASSEMBLY__ +#include +#include + +typedef union { + float f; + double d; + __u64 ui; + struct + { + __u32 hi; + __u32 lo; + } fp; +} freg_t; + +typedef struct { + __u32 fpc; + __u32 pad; + freg_t fprs[NUM_FPRS]; +} s390_fp_regs; + +#define FPC_EXCEPTION_MASK 0xF8000000 +#define FPC_FLAGS_MASK 0x00F80000 +#define FPC_DXC_MASK 0x0000FF00 +#define FPC_RM_MASK 0x00000003 + +/* this typedef defines how a Program Status Word looks like */ +typedef struct { + unsigned long mask; + unsigned long addr; +} __attribute__ ((aligned(8))) psw_t; + +/* + * The s390_regs structure is used to define the elf_gregset_t. + */ +typedef struct { + psw_t psw; + unsigned long gprs[NUM_GPRS]; + unsigned int acrs[NUM_ACRS]; + unsigned long orig_gpr2; +} s390_regs; + +/* + * The user_pt_regs structure exports the beginning of + * the in-kernel pt_regs structure to user space. + */ +typedef struct { + unsigned long args[1]; + psw_t psw; + unsigned long gprs[NUM_GPRS]; +} user_pt_regs; + +/* + * Now for the user space program event recording (trace) definitions. + * The following structures are used only for the ptrace interface, don't + * touch or even look at it if you don't want to modify the user-space + * ptrace interface. In particular stay away from it for in-kernel PER. + */ +typedef struct { + unsigned long cr[NUM_CR_WORDS]; +} per_cr_words; + +#define PER_EM_MASK 0xE8000000UL + +typedef struct { +#ifdef __s390x__ + unsigned : 32; +#endif /* __s390x__ */ + unsigned em_branching : 1; + unsigned em_instruction_fetch : 1; + /* + * Switching on storage alteration automatically fixes + * the storage alteration event bit in the users std. + */ + unsigned em_storage_alteration : 1; + unsigned em_gpr_alt_unused : 1; + unsigned em_store_real_address : 1; + unsigned : 3; + unsigned branch_addr_ctl : 1; + unsigned : 1; + unsigned storage_alt_space_ctl : 1; + unsigned : 21; + unsigned long starting_addr; + unsigned long ending_addr; +} per_cr_bits; + +typedef struct { + unsigned short perc_atmid; + unsigned long address; + unsigned char access_id; +} per_lowcore_words; + +typedef struct { + unsigned perc_branching : 1; + unsigned perc_instruction_fetch : 1; + unsigned perc_storage_alteration : 1; + unsigned perc_gpr_alt_unused : 1; + unsigned perc_store_real_address : 1; + unsigned : 3; + unsigned atmid_psw_bit_31 : 1; + unsigned atmid_validity_bit : 1; + unsigned atmid_psw_bit_32 : 1; + unsigned atmid_psw_bit_5 : 1; + unsigned atmid_psw_bit_16 : 1; + unsigned atmid_psw_bit_17 : 1; + unsigned si : 2; + unsigned long address; + unsigned : 4; + unsigned access_id : 4; +} per_lowcore_bits; + +typedef struct { + union { + per_cr_words words; + per_cr_bits bits; + } control_regs; + /* + * The single_step and instruction_fetch bits are obsolete, + * the kernel always sets them to zero. To enable single + * stepping use ptrace(PTRACE_SINGLESTEP) instead. + */ + unsigned single_step : 1; + unsigned instruction_fetch : 1; + unsigned : 30; + /* + * These addresses are copied into cr10 & cr11 if single + * stepping is switched off + */ + unsigned long starting_addr; + unsigned long ending_addr; + union { + per_lowcore_words words; + per_lowcore_bits bits; + } lowcore; +} per_struct; + +typedef struct { + unsigned int len; + unsigned long kernel_addr; + unsigned long process_addr; +} ptrace_area; + +/* + * S/390 specific non posix ptrace requests. I chose unusual values so + * they are unlikely to clash with future ptrace definitions. + */ +#define PTRACE_PEEKUSR_AREA 0x5000 +#define PTRACE_POKEUSR_AREA 0x5001 +#define PTRACE_PEEKTEXT_AREA 0x5002 +#define PTRACE_PEEKDATA_AREA 0x5003 +#define PTRACE_POKETEXT_AREA 0x5004 +#define PTRACE_POKEDATA_AREA 0x5005 +#define PTRACE_GET_LAST_BREAK 0x5006 +#define PTRACE_PEEK_SYSTEM_CALL 0x5007 +#define PTRACE_POKE_SYSTEM_CALL 0x5008 +#define PTRACE_ENABLE_TE 0x5009 +#define PTRACE_DISABLE_TE 0x5010 +#define PTRACE_TE_ABORT_RAND 0x5011 + +/* + * The numbers chosen here are somewhat arbitrary but absolutely MUST + * not overlap with any of the number assigned in . + */ +#define PTRACE_SINGLEBLOCK 12 /* resume execution until next branch */ + +/* + * PT_PROT definition is loosely based on hppa bsd definition in + * gdb/hppab-nat.c + */ +#define PTRACE_PROT 21 + +typedef enum { + ptprot_set_access_watchpoint, + ptprot_set_write_watchpoint, + ptprot_disable_watchpoint +} ptprot_flags; + +typedef struct { + unsigned long lowaddr; + unsigned long hiaddr; + ptprot_flags prot; +} ptprot_area; + +/* Sequence of bytes for breakpoint illegal instruction. */ +#define S390_BREAKPOINT {0x0,0x1} +#define S390_BREAKPOINT_U16 ((__u16)0x0001) +#define S390_SYSCALL_OPCODE ((__u16)0x0a00) +#define S390_SYSCALL_SIZE 2 + +/* + * The user_regs_struct defines the way the user registers are + * store on the stack for signal handling. + */ +struct user_regs_struct { + psw_t psw; + unsigned long gprs[NUM_GPRS]; + unsigned int acrs[NUM_ACRS]; + unsigned long orig_gpr2; + s390_fp_regs fp_regs; + /* + * These per registers are in here so that gdb can modify them + * itself as there is no "official" ptrace interface for hardware + * watchpoints. This is the way intel does it. + */ + per_struct per_info; + unsigned long ieee_instruction_pointer; /* obsolete, always 0 */ +}; + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_S390_PTRACE_H */ diff --git a/arch/s390/include/uapi/asm/qeth.h b/arch/s390/include/uapi/asm/qeth.h new file mode 100644 index 0000000000..fac9995dfe --- /dev/null +++ b/arch/s390/include/uapi/asm/qeth.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ioctl definitions for qeth driver + * + * Copyright IBM Corp. 2004 + * + * Author(s): Thomas Spatzier + * + */ +#ifndef __ASM_S390_QETH_IOCTL_H__ +#define __ASM_S390_QETH_IOCTL_H__ +#include +#include + +#define SIOC_QETH_ARP_SET_NO_ENTRIES (SIOCDEVPRIVATE) +#define SIOC_QETH_ARP_QUERY_INFO (SIOCDEVPRIVATE + 1) +#define SIOC_QETH_ARP_ADD_ENTRY (SIOCDEVPRIVATE + 2) +#define SIOC_QETH_ARP_REMOVE_ENTRY (SIOCDEVPRIVATE + 3) +#define SIOC_QETH_ARP_FLUSH_CACHE (SIOCDEVPRIVATE + 4) +#define SIOC_QETH_ADP_SET_SNMP_CONTROL (SIOCDEVPRIVATE + 5) +#define SIOC_QETH_GET_CARD_TYPE (SIOCDEVPRIVATE + 6) +#define SIOC_QETH_QUERY_OAT (SIOCDEVPRIVATE + 7) + +struct qeth_arp_cache_entry { + __u8 macaddr[6]; + __u8 reserved1[2]; + __u8 ipaddr[16]; /* for both IPv4 and IPv6 */ + __u8 reserved2[32]; +} __attribute__ ((packed)); + +enum qeth_arp_ipaddrtype { + QETHARP_IP_ADDR_V4 = 1, + QETHARP_IP_ADDR_V6 = 2, +}; +struct qeth_arp_entrytype { + __u8 mac; + __u8 ip; +} __attribute__((packed)); + +#define QETH_QARP_MEDIASPECIFIC_BYTES 32 +#define QETH_QARP_MACADDRTYPE_BYTES 1 +struct qeth_arp_qi_entry7 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[4]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry7_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry7_short { + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[4]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry7_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry5 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 ipaddr[4]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry5_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry5_short { + struct qeth_arp_entrytype type; + __u8 ipaddr[4]; +} __attribute__((packed)); + +struct qeth_arp_qi_entry5_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} __attribute__((packed)); +/* + * can be set by user if no "media specific information" is wanted + * -> saves a lot of space in user space buffer + */ +#define QETH_QARP_STRIP_ENTRIES 0x8000 +#define QETH_QARP_WITH_IPV6 0x4000 +#define QETH_QARP_REQUEST_MASK 0x00ff + +/* data sent to user space as result of query arp ioctl */ +#define QETH_QARP_USER_DATA_SIZE 20000 +#define QETH_QARP_MASK_OFFSET 4 +#define QETH_QARP_ENTRIES_OFFSET 6 +struct qeth_arp_query_user_data { + union { + __u32 data_len; /* set by user space program */ + __u32 no_entries; /* set by kernel */ + } u; + __u16 mask_bits; + char *entries; +} __attribute__((packed)); + +struct qeth_query_oat_data { + __u32 command; + __u32 buffer_len; + __u32 response_len; + __u64 ptr; +}; +#endif /* __ASM_S390_QETH_IOCTL_H__ */ diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h new file mode 100644 index 0000000000..6676f102bd --- /dev/null +++ b/arch/s390/include/uapi/asm/raw3270.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_RAW3270_H +#define __ASM_S390_UAPI_RAW3270_H + +/* Local Channel Commands */ +#define TC_WRITE 0x01 /* Write */ +#define TC_RDBUF 0x02 /* Read Buffer */ +#define TC_EWRITE 0x05 /* Erase write */ +#define TC_READMOD 0x06 /* Read modified */ +#define TC_EWRITEA 0x0d /* Erase write alternate */ +#define TC_WRITESF 0x11 /* Write structured field */ + +/* Buffer Control Orders */ +#define TO_GE 0x08 /* Graphics Escape */ +#define TO_SF 0x1d /* Start field */ +#define TO_SBA 0x11 /* Set buffer address */ +#define TO_IC 0x13 /* Insert cursor */ +#define TO_PT 0x05 /* Program tab */ +#define TO_RA 0x3c /* Repeat to address */ +#define TO_SFE 0x29 /* Start field extended */ +#define TO_EUA 0x12 /* Erase unprotected to address */ +#define TO_MF 0x2c /* Modify field */ +#define TO_SA 0x28 /* Set attribute */ + +/* Field Attribute Bytes */ +#define TF_INPUT 0x40 /* Visible input */ +#define TF_INPUTN 0x4c /* Invisible input */ +#define TF_INMDT 0xc1 /* Visible, Set-MDT */ +#define TF_LOG 0x60 + +/* Character Attribute Bytes */ +#define TAT_RESET 0x00 +#define TAT_FIELD 0xc0 +#define TAT_EXTHI 0x41 +#define TAT_FGCOLOR 0x42 +#define TAT_CHARS 0x43 +#define TAT_BGCOLOR 0x45 +#define TAT_TRANS 0x46 + +/* Extended-Highlighting Bytes */ +#define TAX_RESET 0x00 +#define TAX_BLINK 0xf1 +#define TAX_REVER 0xf2 +#define TAX_UNDER 0xf4 + +/* Reset value */ +#define TAR_RESET 0x00 + +/* Color values */ +#define TAC_RESET 0x00 +#define TAC_BLUE 0xf1 +#define TAC_RED 0xf2 +#define TAC_PINK 0xf3 +#define TAC_GREEN 0xf4 +#define TAC_TURQ 0xf5 +#define TAC_YELLOW 0xf6 +#define TAC_WHITE 0xf7 +#define TAC_DEFAULT 0x00 + +/* Write Control Characters */ +#define TW_NONE 0x40 /* No particular action */ +#define TW_KR 0xc2 /* Keyboard restore */ +#define TW_PLUSALARM 0x04 /* Add this bit for alarm */ + +#define RAW3270_FIRSTMINOR 1 /* First minor number */ +#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */ + +#define AID_CLEAR 0x6d +#define AID_ENTER 0x7d +#define AID_PF3 0xf3 +#define AID_PF7 0xf7 +#define AID_PF8 0xf8 +#define AID_READ_PARTITION 0x88 + +#endif /* __ASM_S390_UAPI_RAW3270_H */ diff --git a/arch/s390/include/uapi/asm/runtime_instr.h b/arch/s390/include/uapi/asm/runtime_instr.h new file mode 100644 index 0000000000..455da46e31 --- /dev/null +++ b/arch/s390/include/uapi/asm/runtime_instr.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _S390_UAPI_RUNTIME_INSTR_H +#define _S390_UAPI_RUNTIME_INSTR_H + +#include + +#define S390_RUNTIME_INSTR_START 0x1 +#define S390_RUNTIME_INSTR_STOP 0x2 + +struct runtime_instr_cb { + __u64 rca; + __u64 roa; + __u64 rla; + + __u32 v : 1; + __u32 s : 1; + __u32 k : 1; + __u32 h : 1; + __u32 a : 1; + __u32 reserved1 : 3; + __u32 ps : 1; + __u32 qs : 1; + __u32 pc : 1; + __u32 qc : 1; + __u32 reserved2 : 1; + __u32 g : 1; + __u32 u : 1; + __u32 l : 1; + __u32 key : 4; + __u32 reserved3 : 8; + __u32 t : 1; + __u32 rgs : 3; + + __u32 m : 4; + __u32 n : 1; + __u32 mae : 1; + __u32 reserved4 : 2; + __u32 c : 1; + __u32 r : 1; + __u32 b : 1; + __u32 j : 1; + __u32 e : 1; + __u32 x : 1; + __u32 reserved5 : 2; + __u32 bpxn : 1; + __u32 bpxt : 1; + __u32 bpti : 1; + __u32 bpni : 1; + __u32 reserved6 : 2; + + __u32 d : 1; + __u32 f : 1; + __u32 ic : 4; + __u32 dc : 4; + + __u64 reserved7; + __u64 sf; + __u64 rsic; + __u64 reserved8; +} __attribute__((__packed__, __aligned__(8))); + +static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000060,0,0,%0" /* LRIC */ + : : "Q" (*cb)); +} + +static inline void store_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000061,0,0,%0" /* STRIC */ + : "=Q" (*cb) : : "cc"); +} + +#endif /* _S390_UAPI_RUNTIME_INSTR_H */ diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h new file mode 100644 index 0000000000..a3e1cf1685 --- /dev/null +++ b/arch/s390/include/uapi/asm/schid.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPIASM_SCHID_H +#define _UAPIASM_SCHID_H + +#include + +#ifndef __ASSEMBLY__ + +struct subchannel_id { + __u32 cssid : 8; + __u32 : 4; + __u32 m : 1; + __u32 ssid : 2; + __u32 one : 1; + __u32 sch_no : 16; +} __attribute__ ((packed, aligned(4))); + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPIASM_SCHID_H */ diff --git a/arch/s390/include/uapi/asm/sclp_ctl.h b/arch/s390/include/uapi/asm/sclp_ctl.h new file mode 100644 index 0000000000..e4e8c4dcd1 --- /dev/null +++ b/arch/s390/include/uapi/asm/sclp_ctl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * IOCTL interface for SCLP + * + * Copyright IBM Corp. 2012 + * + * Author: Michael Holzheu + */ + +#ifndef _ASM_SCLP_CTL_H +#define _ASM_SCLP_CTL_H + +#include + +struct sclp_ctl_sccb { + __u32 cmdw; + __u64 sccb; +} __attribute__((packed)); + +#define SCLP_CTL_IOCTL_MAGIC 0x10 + +#define SCLP_CTL_SCCB \ + _IOWR(SCLP_CTL_IOCTL_MAGIC, 0x10, struct sclp_ctl_sccb) + +#endif diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h new file mode 100644 index 0000000000..598d769e76 --- /dev/null +++ b/arch/s390/include/uapi/asm/setup.h @@ -0,0 +1 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h new file mode 100644 index 0000000000..ede318653c --- /dev/null +++ b/arch/s390/include/uapi/asm/sie.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_S390_SIE_H +#define _UAPI_ASM_S390_SIE_H + +#define diagnose_codes \ + { 0x10, "DIAG (0x10) release pages" }, \ + { 0x44, "DIAG (0x44) time slice end" }, \ + { 0x9c, "DIAG (0x9c) time slice end directed" }, \ + { 0x204, "DIAG (0x204) logical-cpu utilization" }, \ + { 0x258, "DIAG (0x258) page-reference services" }, \ + { 0x288, "DIAG (0x288) watchdog functions" }, \ + { 0x308, "DIAG (0x308) ipl functions" }, \ + { 0x500, "DIAG (0x500) KVM virtio functions" }, \ + { 0x501, "DIAG (0x501) KVM breakpoint" } + +#define sigp_order_codes \ + { 0x01, "SIGP sense" }, \ + { 0x02, "SIGP external call" }, \ + { 0x03, "SIGP emergency signal" }, \ + { 0x04, "SIGP start" }, \ + { 0x05, "SIGP stop" }, \ + { 0x06, "SIGP restart" }, \ + { 0x09, "SIGP stop and store status" }, \ + { 0x0b, "SIGP initial cpu reset" }, \ + { 0x0c, "SIGP cpu reset" }, \ + { 0x0d, "SIGP set prefix" }, \ + { 0x0e, "SIGP store status at address" }, \ + { 0x12, "SIGP set architecture" }, \ + { 0x13, "SIGP conditional emergency signal" }, \ + { 0x15, "SIGP sense running" }, \ + { 0x16, "SIGP set multithreading"}, \ + { 0x17, "SIGP store additional status at address"} + +#define icpt_prog_codes \ + { 0x0001, "Prog Operation" }, \ + { 0x0002, "Prog Privileged Operation" }, \ + { 0x0003, "Prog Execute" }, \ + { 0x0004, "Prog Protection" }, \ + { 0x0005, "Prog Addressing" }, \ + { 0x0006, "Prog Specification" }, \ + { 0x0007, "Prog Data" }, \ + { 0x0008, "Prog Fixedpoint overflow" }, \ + { 0x0009, "Prog Fixedpoint divide" }, \ + { 0x000A, "Prog Decimal overflow" }, \ + { 0x000B, "Prog Decimal divide" }, \ + { 0x000C, "Prog HFP exponent overflow" }, \ + { 0x000D, "Prog HFP exponent underflow" }, \ + { 0x000E, "Prog HFP significance" }, \ + { 0x000F, "Prog HFP divide" }, \ + { 0x0010, "Prog Segment translation" }, \ + { 0x0011, "Prog Page translation" }, \ + { 0x0012, "Prog Translation specification" }, \ + { 0x0013, "Prog Special operation" }, \ + { 0x0015, "Prog Operand" }, \ + { 0x0016, "Prog Trace table" }, \ + { 0x0017, "Prog ASNtranslation specification" }, \ + { 0x001C, "Prog Spaceswitch event" }, \ + { 0x001D, "Prog HFP square root" }, \ + { 0x001F, "Prog PCtranslation specification" }, \ + { 0x0020, "Prog AFX translation" }, \ + { 0x0021, "Prog ASX translation" }, \ + { 0x0022, "Prog LX translation" }, \ + { 0x0023, "Prog EX translation" }, \ + { 0x0024, "Prog Primary authority" }, \ + { 0x0025, "Prog Secondary authority" }, \ + { 0x0026, "Prog LFXtranslation exception" }, \ + { 0x0027, "Prog LSXtranslation exception" }, \ + { 0x0028, "Prog ALET specification" }, \ + { 0x0029, "Prog ALEN translation" }, \ + { 0x002A, "Prog ALE sequence" }, \ + { 0x002B, "Prog ASTE validity" }, \ + { 0x002C, "Prog ASTE sequence" }, \ + { 0x002D, "Prog Extended authority" }, \ + { 0x002E, "Prog LSTE sequence" }, \ + { 0x002F, "Prog ASTE instance" }, \ + { 0x0030, "Prog Stack full" }, \ + { 0x0031, "Prog Stack empty" }, \ + { 0x0032, "Prog Stack specification" }, \ + { 0x0033, "Prog Stack type" }, \ + { 0x0034, "Prog Stack operation" }, \ + { 0x0039, "Prog Region first translation" }, \ + { 0x003A, "Prog Region second translation" }, \ + { 0x003B, "Prog Region third translation" }, \ + { 0x0040, "Prog Monitor event" }, \ + { 0x0080, "Prog PER event" }, \ + { 0x0119, "Prog Crypto operation" } + +#define exit_code_ipa0(ipa0, opcode, mnemonic) \ + { (ipa0 << 8 | opcode), #ipa0 " " mnemonic } +#define exit_code(opcode, mnemonic) \ + { opcode, mnemonic } + +#define icpt_insn_codes \ + exit_code_ipa0(0x01, 0x01, "PR"), \ + exit_code_ipa0(0x01, 0x04, "PTFF"), \ + exit_code_ipa0(0x01, 0x07, "SCKPF"), \ + exit_code_ipa0(0xAA, 0x00, "RINEXT"), \ + exit_code_ipa0(0xAA, 0x01, "RION"), \ + exit_code_ipa0(0xAA, 0x02, "TRIC"), \ + exit_code_ipa0(0xAA, 0x03, "RIOFF"), \ + exit_code_ipa0(0xAA, 0x04, "RIEMIT"), \ + exit_code_ipa0(0xB2, 0x02, "STIDP"), \ + exit_code_ipa0(0xB2, 0x04, "SCK"), \ + exit_code_ipa0(0xB2, 0x05, "STCK"), \ + exit_code_ipa0(0xB2, 0x06, "SCKC"), \ + exit_code_ipa0(0xB2, 0x07, "STCKC"), \ + exit_code_ipa0(0xB2, 0x08, "SPT"), \ + exit_code_ipa0(0xB2, 0x09, "STPT"), \ + exit_code_ipa0(0xB2, 0x0d, "PTLB"), \ + exit_code_ipa0(0xB2, 0x10, "SPX"), \ + exit_code_ipa0(0xB2, 0x11, "STPX"), \ + exit_code_ipa0(0xB2, 0x12, "STAP"), \ + exit_code_ipa0(0xB2, 0x14, "SIE"), \ + exit_code_ipa0(0xB2, 0x16, "SETR"), \ + exit_code_ipa0(0xB2, 0x17, "STETR"), \ + exit_code_ipa0(0xB2, 0x18, "PC"), \ + exit_code_ipa0(0xB2, 0x20, "SERVC"), \ + exit_code_ipa0(0xB2, 0x21, "IPTE"), \ + exit_code_ipa0(0xB2, 0x28, "PT"), \ + exit_code_ipa0(0xB2, 0x29, "ISKE"), \ + exit_code_ipa0(0xB2, 0x2a, "RRBE"), \ + exit_code_ipa0(0xB2, 0x2b, "SSKE"), \ + exit_code_ipa0(0xB2, 0x2c, "TB"), \ + exit_code_ipa0(0xB2, 0x2e, "PGIN"), \ + exit_code_ipa0(0xB2, 0x2f, "PGOUT"), \ + exit_code_ipa0(0xB2, 0x30, "CSCH"), \ + exit_code_ipa0(0xB2, 0x31, "HSCH"), \ + exit_code_ipa0(0xB2, 0x32, "MSCH"), \ + exit_code_ipa0(0xB2, 0x33, "SSCH"), \ + exit_code_ipa0(0xB2, 0x34, "STSCH"), \ + exit_code_ipa0(0xB2, 0x35, "TSCH"), \ + exit_code_ipa0(0xB2, 0x36, "TPI"), \ + exit_code_ipa0(0xB2, 0x37, "SAL"), \ + exit_code_ipa0(0xB2, 0x38, "RSCH"), \ + exit_code_ipa0(0xB2, 0x39, "STCRW"), \ + exit_code_ipa0(0xB2, 0x3a, "STCPS"), \ + exit_code_ipa0(0xB2, 0x3b, "RCHP"), \ + exit_code_ipa0(0xB2, 0x3c, "SCHM"), \ + exit_code_ipa0(0xB2, 0x40, "BAKR"), \ + exit_code_ipa0(0xB2, 0x48, "PALB"), \ + exit_code_ipa0(0xB2, 0x4c, "TAR"), \ + exit_code_ipa0(0xB2, 0x50, "CSP"), \ + exit_code_ipa0(0xB2, 0x54, "MVPG"), \ + exit_code_ipa0(0xB2, 0x56, "STHYI"), \ + exit_code_ipa0(0xB2, 0x58, "BSG"), \ + exit_code_ipa0(0xB2, 0x5a, "BSA"), \ + exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ + exit_code_ipa0(0xB2, 0x74, "SIGA"), \ + exit_code_ipa0(0xB2, 0x76, "XSCH"), \ + exit_code_ipa0(0xB2, 0x78, "STCKE"), \ + exit_code_ipa0(0xB2, 0x7c, "STCKF"), \ + exit_code_ipa0(0xB2, 0x7d, "STSI"), \ + exit_code_ipa0(0xB2, 0xb0, "STFLE"), \ + exit_code_ipa0(0xB2, 0xb1, "STFL"), \ + exit_code_ipa0(0xB2, 0xb2, "LPSWE"), \ + exit_code_ipa0(0xB2, 0xf8, "TEND"), \ + exit_code_ipa0(0xB2, 0xfc, "TABORT"), \ + exit_code_ipa0(0xB9, 0x1e, "KMAC"), \ + exit_code_ipa0(0xB9, 0x28, "PCKMO"), \ + exit_code_ipa0(0xB9, 0x2a, "KMF"), \ + exit_code_ipa0(0xB9, 0x2b, "KMO"), \ + exit_code_ipa0(0xB9, 0x2d, "KMCTR"), \ + exit_code_ipa0(0xB9, 0x2e, "KM"), \ + exit_code_ipa0(0xB9, 0x2f, "KMC"), \ + exit_code_ipa0(0xB9, 0x3e, "KIMD"), \ + exit_code_ipa0(0xB9, 0x3f, "KLMD"), \ + exit_code_ipa0(0xB9, 0x8a, "CSPG"), \ + exit_code_ipa0(0xB9, 0x8d, "EPSW"), \ + exit_code_ipa0(0xB9, 0x8e, "IDTE"), \ + exit_code_ipa0(0xB9, 0x8f, "CRDTE"), \ + exit_code_ipa0(0xB9, 0x9c, "EQBS"), \ + exit_code_ipa0(0xB9, 0xa2, "PTF"), \ + exit_code_ipa0(0xB9, 0xab, "ESSA"), \ + exit_code_ipa0(0xB9, 0xae, "RRBM"), \ + exit_code_ipa0(0xB9, 0xaf, "PFMF"), \ + exit_code_ipa0(0xE3, 0x03, "LRAG"), \ + exit_code_ipa0(0xE3, 0x13, "LRAY"), \ + exit_code_ipa0(0xE3, 0x25, "NTSTG"), \ + exit_code_ipa0(0xE5, 0x00, "LASP"), \ + exit_code_ipa0(0xE5, 0x01, "TPROT"), \ + exit_code_ipa0(0xE5, 0x60, "TBEGIN"), \ + exit_code_ipa0(0xE5, 0x61, "TBEGINC"), \ + exit_code_ipa0(0xEB, 0x25, "STCTG"), \ + exit_code_ipa0(0xEB, 0x2f, "LCTLG"), \ + exit_code_ipa0(0xEB, 0x60, "LRIC"), \ + exit_code_ipa0(0xEB, 0x61, "STRIC"), \ + exit_code_ipa0(0xEB, 0x62, "MRIC"), \ + exit_code_ipa0(0xEB, 0x8a, "SQBS"), \ + exit_code_ipa0(0xC8, 0x01, "ECTG"), \ + exit_code(0x0a, "SVC"), \ + exit_code(0x80, "SSM"), \ + exit_code(0x82, "LPSW"), \ + exit_code(0x83, "DIAG"), \ + exit_code(0xae, "SIGP"), \ + exit_code(0xac, "STNSM"), \ + exit_code(0xad, "STOSM"), \ + exit_code(0xb1, "LRA"), \ + exit_code(0xb6, "STCTL"), \ + exit_code(0xb7, "LCTL"), \ + exit_code(0xee, "PLO") + +#define sie_intercept_code \ + { 0x00, "Host interruption" }, \ + { 0x04, "Instruction" }, \ + { 0x08, "Program interruption" }, \ + { 0x0c, "Instruction and program interruption" }, \ + { 0x10, "External request" }, \ + { 0x14, "External interruption" }, \ + { 0x18, "I/O request" }, \ + { 0x1c, "Wait state" }, \ + { 0x20, "Validity" }, \ + { 0x28, "Stop request" }, \ + { 0x2c, "Operation exception" }, \ + { 0x38, "Partial-execution" }, \ + { 0x3c, "I/O interruption" }, \ + { 0x40, "I/O instruction" }, \ + { 0x48, "Timing subset" } + +/* + * This is the simple interceptable instructions decoder. + * + * It will be used as userspace interface and it can be used in places + * that does not allow to use general decoder functions, + * such as trace events declarations. + * + * Some userspace tools may want to parse this code + * and would be confused by switch(), if() and other statements, + * but they can understand conditional operator. + */ +#define INSN_DECODE_IPA0(ipa0, insn, rshift, mask) \ + (insn >> 56) == (ipa0) ? \ + ((ipa0 << 8) | ((insn >> rshift) & mask)) : + +#define INSN_DECODE(insn) (insn >> 56) + +/* + * The macro icpt_insn_decoder() takes an intercepted instruction + * and returns a key, which can be used to find a mnemonic name + * of the instruction in the icpt_insn_codes table. + */ +#define icpt_insn_decoder(insn) ( \ + INSN_DECODE_IPA0(0x01, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f) \ + INSN_DECODE_IPA0(0xb2, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xb9, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xe3, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xe5, insn, 48, 0xff) \ + INSN_DECODE_IPA0(0xeb, insn, 16, 0xff) \ + INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f) \ + INSN_DECODE(insn)) + +#endif /* _UAPI_ASM_S390_SIE_H */ diff --git a/arch/s390/include/uapi/asm/sigcontext.h b/arch/s390/include/uapi/asm/sigcontext.h new file mode 100644 index 0000000000..8b35033334 --- /dev/null +++ b/arch/s390/include/uapi/asm/sigcontext.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + */ + +#ifndef _ASM_S390_SIGCONTEXT_H +#define _ASM_S390_SIGCONTEXT_H + +#include +#include + +#define __NUM_GPRS 16 +#define __NUM_FPRS 16 +#define __NUM_ACRS 16 +#define __NUM_VXRS 32 +#define __NUM_VXRS_LOW 16 +#define __NUM_VXRS_HIGH 16 + +#ifndef __s390x__ + +/* Has to be at least _NSIG_WORDS from asm/signal.h */ +#define _SIGCONTEXT_NSIG 64 +#define _SIGCONTEXT_NSIG_BPW 32 +/* Size of stack frame allocated when calling signal handler. */ +#define __SIGNAL_FRAMESIZE 96 + +#else /* __s390x__ */ + +/* Has to be at least _NSIG_WORDS from asm/signal.h */ +#define _SIGCONTEXT_NSIG 64 +#define _SIGCONTEXT_NSIG_BPW 64 +/* Size of stack frame allocated when calling signal handler. */ +#define __SIGNAL_FRAMESIZE 160 + +#endif /* __s390x__ */ + +#define _SIGCONTEXT_NSIG_WORDS (_SIGCONTEXT_NSIG / _SIGCONTEXT_NSIG_BPW) +#define _SIGMASK_COPY_SIZE (sizeof(unsigned long)*_SIGCONTEXT_NSIG_WORDS) + +typedef struct +{ + unsigned long mask; + unsigned long addr; +} __attribute__ ((aligned(8))) _psw_t; + +typedef struct +{ + _psw_t psw; + unsigned long gprs[__NUM_GPRS]; + unsigned int acrs[__NUM_ACRS]; +} _s390_regs_common; + +typedef struct +{ + unsigned int fpc; + unsigned int pad; + double fprs[__NUM_FPRS]; +} _s390_fp_regs; + +typedef struct +{ + _s390_regs_common regs; + _s390_fp_regs fpregs; +} _sigregs; + +typedef struct +{ +#ifndef __s390x__ + unsigned long gprs_high[__NUM_GPRS]; +#endif + unsigned long long vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + unsigned char __reserved[128]; +} _sigregs_ext; + +struct sigcontext +{ + unsigned long oldmask[_SIGCONTEXT_NSIG_WORDS]; + _sigregs __user *sregs; +}; + + +#endif + diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h new file mode 100644 index 0000000000..e74d6ba1bd --- /dev/null +++ b/arch/s390/include/uapi/asm/signal.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/signal.h" + */ + +#ifndef _UAPI_ASMS390_SIGNAL_H +#define _UAPI_ASMS390_SIGNAL_H + +#include +#include + +/* Avoid too many header ordering problems. */ +struct siginfo; +struct pt_regs; + +#ifndef __KERNEL__ +/* Here we must cater to libcs that poke about in kernel headers. */ + +#define NSIG 32 +typedef unsigned long sigset_t; + +#endif /* __KERNEL__ */ + +#define SIGHUP 1 +#define SIGINT 2 +#define SIGQUIT 3 +#define SIGILL 4 +#define SIGTRAP 5 +#define SIGABRT 6 +#define SIGIOT 6 +#define SIGBUS 7 +#define SIGFPE 8 +#define SIGKILL 9 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGPIPE 13 +#define SIGALRM 14 +#define SIGTERM 15 +#define SIGSTKFLT 16 +#define SIGCHLD 17 +#define SIGCONT 18 +#define SIGSTOP 19 +#define SIGTSTP 20 +#define SIGTTIN 21 +#define SIGTTOU 22 +#define SIGURG 23 +#define SIGXCPU 24 +#define SIGXFSZ 25 +#define SIGVTALRM 26 +#define SIGPROF 27 +#define SIGWINCH 28 +#define SIGIO 29 +#define SIGPOLL SIGIO +/* +#define SIGLOST 29 +*/ +#define SIGPWR 30 +#define SIGSYS 31 +#define SIGUNUSED 31 + +/* These should not be considered constants from userland. */ +#define SIGRTMIN 32 +#define SIGRTMAX _NSIG + +#define SA_RESTORER 0x04000000 + +#define MINSIGSTKSZ 2048 +#define SIGSTKSZ 8192 + +#include + +#ifndef __KERNEL__ + +/* + * There are two system calls in regard to sigaction, sys_rt_sigaction + * and sys_sigaction. Internally the kernel uses the struct old_sigaction + * for the older sys_sigaction system call, and the kernel version of the + * struct sigaction for the newer sys_rt_sigaction. + * + * The uapi definition for struct sigaction has made a strange distinction + * between 31-bit and 64-bit in the past. For 64-bit the uapi structure + * looks like the kernel struct sigaction, but for 31-bit it used to + * look like the kernel struct old_sigaction. That practically made the + * structure unusable for either system call. To get around this problem + * the glibc always had its own definitions for the sigaction structures. + * + * The current struct sigaction uapi definition below is suitable for the + * sys_rt_sigaction system call only. + */ +struct sigaction { + union { + __sighandler_t _sa_handler; + void (*_sa_sigaction)(int, struct siginfo *, void *); + } _u; + unsigned long sa_flags; + void (*sa_restorer)(void); + sigset_t sa_mask; +}; + +#define sa_handler _u._sa_handler +#define sa_sigaction _u._sa_sigaction + +#endif /* __KERNEL__ */ + +typedef struct sigaltstack { + void __user *ss_sp; + int ss_flags; + __kernel_size_t ss_size; +} stack_t; + + +#endif /* _UAPI_ASMS390_SIGNAL_H */ diff --git a/arch/s390/include/uapi/asm/stat.h b/arch/s390/include/uapi/asm/stat.h new file mode 100644 index 0000000000..ac253d2360 --- /dev/null +++ b/arch/s390/include/uapi/asm/stat.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/stat.h" + */ + +#ifndef _S390_STAT_H +#define _S390_STAT_H + +#ifndef __s390x__ +struct __old_kernel_stat { + unsigned short st_dev; + unsigned short st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + unsigned long st_size; + unsigned long st_atime; + unsigned long st_mtime; + unsigned long st_ctime; +}; + +struct stat { + unsigned short st_dev; + unsigned short __pad1; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + unsigned short __pad2; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused4; + unsigned long __unused5; +}; + +/* This matches struct stat64 in glibc2.1, hence the absolutely + * insane amounts of padding around dev_t's. + */ +struct stat64 { + unsigned long long st_dev; + unsigned int __pad1; +#define STAT64_HAS_BROKEN_ST_INO 1 + unsigned long __st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned long st_uid; + unsigned long st_gid; + unsigned long long st_rdev; + unsigned int __pad3; + long long st_size; + unsigned long st_blksize; + unsigned char __pad4[4]; + unsigned long __pad5; /* future possible st_blocks high bits */ + unsigned long st_blocks; /* Number 512-byte blocks allocated. */ + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; /* will be high 32 bits of ctime someday */ + unsigned long long st_ino; +}; + +#else /* __s390x__ */ + +struct stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + unsigned int st_gid; + unsigned int __pad1; + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long st_blksize; + long st_blocks; + unsigned long __unused[3]; +}; + +#endif /* __s390x__ */ + +#define STAT_HAVE_NSEC 1 + +#endif diff --git a/arch/s390/include/uapi/asm/statfs.h b/arch/s390/include/uapi/asm/statfs.h new file mode 100644 index 0000000000..f85b50723d --- /dev/null +++ b/arch/s390/include/uapi/asm/statfs.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/statfs.h" + */ + +#ifndef _S390_STATFS_H +#define _S390_STATFS_H + +/* + * We can't use because in 64-bit mode + * we mix ints of different sizes in our struct statfs. + */ + +#ifndef __KERNEL_STRICT_NAMES +#include +typedef __kernel_fsid_t fsid_t; +#endif + +struct statfs { + unsigned int f_type; + unsigned int f_bsize; + unsigned long f_blocks; + unsigned long f_bfree; + unsigned long f_bavail; + unsigned long f_files; + unsigned long f_ffree; + __kernel_fsid_t f_fsid; + unsigned int f_namelen; + unsigned int f_frsize; + unsigned int f_flags; + unsigned int f_spare[5]; +}; + +struct statfs64 { + unsigned int f_type; + unsigned int f_bsize; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_bavail; + unsigned long long f_files; + unsigned long long f_ffree; + __kernel_fsid_t f_fsid; + unsigned int f_namelen; + unsigned int f_frsize; + unsigned int f_flags; + unsigned int f_spare[5]; +}; + +#endif diff --git a/arch/s390/include/uapi/asm/sthyi.h b/arch/s390/include/uapi/asm/sthyi.h new file mode 100644 index 0000000000..b1b0223169 --- /dev/null +++ b/arch/s390/include/uapi/asm/sthyi.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_STHYI_H +#define _UAPI_ASM_STHYI_H + +#define STHYI_FC_CP_IFL_CAP 0 + +#endif /* _UAPI_ASM_STHYI_H */ diff --git a/arch/s390/include/uapi/asm/tape390.h b/arch/s390/include/uapi/asm/tape390.h new file mode 100644 index 0000000000..90266c6964 --- /dev/null +++ b/arch/s390/include/uapi/asm/tape390.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/************************************************************************* + * + * enables user programs to display messages and control encryption + * on s390 tape devices + * + * Copyright IBM Corp. 2001, 2006 + * Author(s): Michael Holzheu + * + *************************************************************************/ + +#ifndef _TAPE390_H +#define _TAPE390_H + +#define TAPE390_DISPLAY _IOW('d', 1, struct display_struct) + +/* + * The TAPE390_DISPLAY ioctl calls the Load Display command + * which transfers 17 bytes of data from the channel to the subsystem: + * - 1 format control byte, and + * - two 8-byte messages + * + * Format control byte: + * 0-2: New Message Overlay + * 3: Alternate Messages + * 4: Blink Message + * 5: Display Low/High Message + * 6: Reserved + * 7: Automatic Load Request + * + */ + +typedef struct display_struct { + char cntrl; + char message1[8]; + char message2[8]; +} display_struct; + +/* + * Tape encryption support + */ + +struct tape390_crypt_info { + char capability; + char status; + char medium_status; +} __attribute__ ((packed)); + + +/* Macros for "capable" field */ +#define TAPE390_CRYPT_SUPPORTED_MASK 0x01 +#define TAPE390_CRYPT_SUPPORTED(x) \ + ((x.capability & TAPE390_CRYPT_SUPPORTED_MASK)) + +/* Macros for "status" field */ +#define TAPE390_CRYPT_ON_MASK 0x01 +#define TAPE390_CRYPT_ON(x) (((x.status) & TAPE390_CRYPT_ON_MASK)) + +/* Macros for "medium status" field */ +#define TAPE390_MEDIUM_LOADED_MASK 0x01 +#define TAPE390_MEDIUM_ENCRYPTED_MASK 0x02 +#define TAPE390_MEDIUM_ENCRYPTED(x) \ + (((x.medium_status) & TAPE390_MEDIUM_ENCRYPTED_MASK)) +#define TAPE390_MEDIUM_LOADED(x) \ + (((x.medium_status) & TAPE390_MEDIUM_LOADED_MASK)) + +/* + * The TAPE390_CRYPT_SET ioctl is used to switch on/off encryption. + * The "encryption_capable" and "tape_status" fields are ignored for this ioctl! + */ +#define TAPE390_CRYPT_SET _IOW('d', 2, struct tape390_crypt_info) + +/* + * The TAPE390_CRYPT_QUERY ioctl is used to query the encryption state. + */ +#define TAPE390_CRYPT_QUERY _IOR('d', 3, struct tape390_crypt_info) + +/* Values for "kekl1/2_type" and "kekl1/2_type_on_tape" fields */ +#define TAPE390_KEKL_TYPE_NONE 0 +#define TAPE390_KEKL_TYPE_LABEL 1 +#define TAPE390_KEKL_TYPE_HASH 2 + +struct tape390_kekl { + unsigned char type; + unsigned char type_on_tape; + char label[65]; +} __attribute__ ((packed)); + +struct tape390_kekl_pair { + struct tape390_kekl kekl[2]; +} __attribute__ ((packed)); + +/* + * The TAPE390_KEKL_SET ioctl is used to set Key Encrypting Key labels. + */ +#define TAPE390_KEKL_SET _IOW('d', 4, struct tape390_kekl_pair) + +/* + * The TAPE390_KEKL_QUERY ioctl is used to query Key Encrypting Key labels. + */ +#define TAPE390_KEKL_QUERY _IOR('d', 5, struct tape390_kekl_pair) + +#endif diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h new file mode 100644 index 0000000000..84457dbb26 --- /dev/null +++ b/arch/s390/include/uapi/asm/types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/types.h" + */ + +#ifndef _UAPI_S390_TYPES_H +#define _UAPI_S390_TYPES_H + +#include + +#ifndef __ASSEMBLY__ + +typedef unsigned long addr_t; +typedef __signed__ long saddr_t; + +typedef struct { + union { + struct { + __u64 high; + __u64 low; + }; + __u32 u[4]; + }; +} __attribute__((packed, aligned(4))) __vector128; + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_S390_TYPES_H */ diff --git a/arch/s390/include/uapi/asm/ucontext.h b/arch/s390/include/uapi/asm/ucontext.h new file mode 100644 index 0000000000..c95f42e853 --- /dev/null +++ b/arch/s390/include/uapi/asm/ucontext.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/ucontext.h" + */ + +#ifndef _ASM_S390_UCONTEXT_H +#define _ASM_S390_UCONTEXT_H + +#define UC_GPRS_HIGH 1 /* uc_mcontext_ext has valid high gprs */ +#define UC_VXRS 2 /* uc_mcontext_ext has valid vector regs */ + +/* + * The struct ucontext_extended describes how the registers are stored + * on a rt signal frame. Please note that the structure is not fixed, + * if new CPU registers are added to the user state the size of the + * struct ucontext_extended will increase. + */ +struct ucontext_extended { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + _sigregs uc_mcontext; + sigset_t uc_sigmask; + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + unsigned char __unused[128 - sizeof(sigset_t)]; + _sigregs_ext uc_mcontext_ext; +}; + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + _sigregs uc_mcontext; + sigset_t uc_sigmask; + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + unsigned char __unused[128 - sizeof(sigset_t)]; +}; + +#endif /* !_ASM_S390_UCONTEXT_H */ diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h new file mode 100644 index 0000000000..01b5fe8b9d --- /dev/null +++ b/arch/s390/include/uapi/asm/unistd.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * S390 version + * + * Derived from "include/asm-i386/unistd.h" + */ + +#ifndef _UAPI_ASM_S390_UNISTD_H_ +#define _UAPI_ASM_S390_UNISTD_H_ + +#ifdef __s390x__ +#include +#else +#include +#endif + +#endif /* _UAPI_ASM_S390_UNISTD_H_ */ diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h new file mode 100644 index 0000000000..b9c2f14a6a --- /dev/null +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2022 + * Author(s): Steffen Eiden + */ +#ifndef __S390_ASM_UVDEVICE_H +#define __S390_ASM_UVDEVICE_H + +#include + +struct uvio_ioctl_cb { + __u32 flags; + __u16 uv_rc; /* UV header rc value */ + __u16 uv_rrc; /* UV header rrc value */ + __u64 argument_addr; /* Userspace address of uvio argument */ + __u32 argument_len; + __u8 reserved14[0x40 - 0x14]; /* must be zero */ +}; + +#define UVIO_ATT_USER_DATA_LEN 0x100 +#define UVIO_ATT_UID_LEN 0x10 +struct uvio_attest { + __u64 arcb_addr; /* 0x0000 */ + __u64 meas_addr; /* 0x0008 */ + __u64 add_data_addr; /* 0x0010 */ + __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */ + __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */ + __u32 arcb_len; /* 0x0128 */ + __u32 meas_len; /* 0x012c */ + __u32 add_data_len; /* 0x0130 */ + __u16 user_data_len; /* 0x0134 */ + __u16 reserved136; /* 0x0136 */ +}; + +/** + * uvio_uvdev_info - Information of supported functions + * @supp_uvio_cmds - supported IOCTLs by this device + * @supp_uv_cmds - supported UVCs corresponding to the IOCTL + * + * UVIO request to get information about supported request types by this + * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0 + * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the + * uvdevice and the Ultravisor support that call. + * + * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds` + * as there is no corresponding UV-call. + */ +struct uvio_uvdev_info { + /* + * If bit `n` is set, this device supports the IOCTL with nr `n`. + */ + __u64 supp_uvio_cmds; + /* + * If bit `n` is set, the Ultravisor(UV) supports the UV-call + * corresponding to the IOCTL with nr `n` in the calling contextx (host + * or guest). The value is only valid if the corresponding bit in + * @supp_uvio_cmds is set as well. + */ + __u64 supp_uv_cmds; +}; + +/* + * The following max values define an upper length for the IOCTL in/out buffers. + * However, they do not represent the maximum the Ultravisor allows which is + * often way smaller. By allowing larger buffer sizes we hopefully do not need + * to update the code with every machine update. It is therefore possible for + * userspace to request more memory than actually used by kernel/UV. + */ +#define UVIO_ATT_ARCB_MAX_LEN 0x100000 +#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 +#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 +#define UVIO_ADD_SECRET_MAX_LEN 0x100000 +#define UVIO_LIST_SECRETS_LEN 0x1000 + +#define UVIO_DEVICE_NAME "uv" +#define UVIO_TYPE_UVC 'u' + +enum UVIO_IOCTL_NR { + UVIO_IOCTL_UVDEV_INFO_NR = 0x00, + UVIO_IOCTL_ATT_NR, + UVIO_IOCTL_ADD_SECRET_NR, + UVIO_IOCTL_LIST_SECRETS_NR, + UVIO_IOCTL_LOCK_SECRETS_NR, + /* must be the last entry */ + UVIO_IOCTL_NUM_IOCTLS +}; + +#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) +#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) +#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) +#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#define UVIO_SUPP_CALL(nr) (1ULL << (nr)) +#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) +#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) +#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR) + +#endif /* __S390_ASM_UVDEVICE_H */ diff --git a/arch/s390/include/uapi/asm/virtio-ccw.h b/arch/s390/include/uapi/asm/virtio-ccw.h new file mode 100644 index 0000000000..2b605f7e84 --- /dev/null +++ b/arch/s390/include/uapi/asm/virtio-ccw.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Definitions for virtio-ccw devices. + * + * Copyright IBM Corp. 2013 + * + * Author(s): Cornelia Huck + */ +#ifndef __KVM_VIRTIO_CCW_H +#define __KVM_VIRTIO_CCW_H + +/* Alignment of vring buffers. */ +#define KVM_VIRTIO_CCW_RING_ALIGN 4096 + +/* Subcode for diagnose 500 (virtio hypercall). */ +#define KVM_S390_VIRTIO_CCW_NOTIFY 3 + +#endif diff --git a/arch/s390/include/uapi/asm/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h new file mode 100644 index 0000000000..aeaaa03003 --- /dev/null +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2004, 2005 + * Interface implementation for communication with the z/VM control program + * Version 1.0 + * Author(s): Christian Borntraeger + * + * + * z/VMs CP offers the possibility to issue commands via the diagnose code 8 + * this driver implements a character device that issues these commands and + * returns the answer of CP. + * + * The idea of this driver is based on cpint from Neale Ferguson + */ + +#ifndef _UAPI_ASM_VMCP_H +#define _UAPI_ASM_VMCP_H + +#include + +#define VMCP_GETCODE _IOR(0x10, 1, int) +#define VMCP_SETBUF _IOW(0x10, 2, int) +#define VMCP_GETSIZE _IOR(0x10, 3, int) + +#endif /* _UAPI_ASM_VMCP_H */ diff --git a/arch/s390/include/uapi/asm/vtoc.h b/arch/s390/include/uapi/asm/vtoc.h new file mode 100644 index 0000000000..50c1d7b9e8 --- /dev/null +++ b/arch/s390/include/uapi/asm/vtoc.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * This file contains volume label definitions for DASD devices. + * + * Copyright IBM Corp. 2005 + * + * Author(s): Volker Sameske + * + */ + +#ifndef _ASM_S390_VTOC_H +#define _ASM_S390_VTOC_H + +#include + +struct vtoc_ttr +{ + __u16 tt; + __u8 r; +} __attribute__ ((packed)); + +struct vtoc_cchhb +{ + __u16 cc; + __u16 hh; + __u8 b; +} __attribute__ ((packed)); + +struct vtoc_cchh +{ + __u16 cc; + __u16 hh; +} __attribute__ ((packed)); + +struct vtoc_labeldate +{ + __u8 year; + __u16 day; +} __attribute__ ((packed)); + +struct vtoc_volume_label_cdl +{ + char volkey[4]; /* volume key = volume label */ + char vollbl[4]; /* volume label */ + char volid[6]; /* volume identifier */ + __u8 security; /* security byte */ + struct vtoc_cchhb vtoc; /* VTOC address */ + char res1[5]; /* reserved */ + char cisize[4]; /* CI-size for FBA,... */ + /* ...blanks for CKD */ + char blkperci[4]; /* no of blocks per CI (FBA), blanks for CKD */ + char labperci[4]; /* no of labels per CI (FBA), blanks for CKD */ + char res2[4]; /* reserved */ + char lvtoc[14]; /* owner code for LVTOC */ + char res3[29]; /* reserved */ +} __attribute__ ((packed)); + +struct vtoc_volume_label_ldl { + char vollbl[4]; /* volume label */ + char volid[6]; /* volume identifier */ + char res3[69]; /* reserved */ + char ldl_version; /* version number, valid for ldl format */ + __u64 formatted_blocks; /* valid when ldl_version >= f2 */ +} __attribute__ ((packed)); + +struct vtoc_extent +{ + __u8 typeind; /* extent type indicator */ + __u8 seqno; /* extent sequence number */ + struct vtoc_cchh llimit; /* starting point of this extent */ + struct vtoc_cchh ulimit; /* ending point of this extent */ +} __attribute__ ((packed)); + +struct vtoc_dev_const +{ + __u16 DS4DSCYL; /* number of logical cyls */ + __u16 DS4DSTRK; /* number of tracks in a logical cylinder */ + __u16 DS4DEVTK; /* device track length */ + __u8 DS4DEVI; /* non-last keyed record overhead */ + __u8 DS4DEVL; /* last keyed record overhead */ + __u8 DS4DEVK; /* non-keyed record overhead differential */ + __u8 DS4DEVFG; /* flag byte */ + __u16 DS4DEVTL; /* device tolerance */ + __u8 DS4DEVDT; /* number of DSCB's per track */ + __u8 DS4DEVDB; /* number of directory blocks per track */ +} __attribute__ ((packed)); + +struct vtoc_format1_label +{ + char DS1DSNAM[44]; /* data set name */ + __u8 DS1FMTID; /* format identifier */ + char DS1DSSN[6]; /* data set serial number */ + __u16 DS1VOLSQ; /* volume sequence number */ + struct vtoc_labeldate DS1CREDT; /* creation date: ydd */ + struct vtoc_labeldate DS1EXPDT; /* expiration date */ + __u8 DS1NOEPV; /* number of extents on volume */ + __u8 DS1NOBDB; /* no. of bytes used in last direction blk */ + __u8 DS1FLAG1; /* flag 1 */ + char DS1SYSCD[13]; /* system code */ + struct vtoc_labeldate DS1REFD; /* date last referenced */ + __u8 DS1SMSFG; /* system managed storage indicators */ + __u8 DS1SCXTF; /* sec. space extension flag byte */ + __u16 DS1SCXTV; /* secondary space extension value */ + __u8 DS1DSRG1; /* data set organisation byte 1 */ + __u8 DS1DSRG2; /* data set organisation byte 2 */ + __u8 DS1RECFM; /* record format */ + __u8 DS1OPTCD; /* option code */ + __u16 DS1BLKL; /* block length */ + __u16 DS1LRECL; /* record length */ + __u8 DS1KEYL; /* key length */ + __u16 DS1RKP; /* relative key position */ + __u8 DS1DSIND; /* data set indicators */ + __u8 DS1SCAL1; /* secondary allocation flag byte */ + char DS1SCAL3[3]; /* secondary allocation quantity */ + struct vtoc_ttr DS1LSTAR; /* last used track and block on track */ + __u16 DS1TRBAL; /* space remaining on last used track */ + __u16 res1; /* reserved */ + struct vtoc_extent DS1EXT1; /* first extent description */ + struct vtoc_extent DS1EXT2; /* second extent description */ + struct vtoc_extent DS1EXT3; /* third extent description */ + struct vtoc_cchhb DS1PTRDS; /* possible pointer to f2 or f3 DSCB */ +} __attribute__ ((packed)); + +struct vtoc_format4_label +{ + char DS4KEYCD[44]; /* key code for VTOC labels: 44 times 0x04 */ + __u8 DS4IDFMT; /* format identifier */ + struct vtoc_cchhb DS4HPCHR; /* highest address of a format 1 DSCB */ + __u16 DS4DSREC; /* number of available DSCB's */ + struct vtoc_cchh DS4HCCHH; /* CCHH of next available alternate track */ + __u16 DS4NOATK; /* number of remaining alternate tracks */ + __u8 DS4VTOCI; /* VTOC indicators */ + __u8 DS4NOEXT; /* number of extents in VTOC */ + __u8 DS4SMSFG; /* system managed storage indicators */ + __u8 DS4DEVAC; /* number of alternate cylinders. + * Subtract from first two bytes of + * DS4DEVSZ to get number of usable + * cylinders. can be zero. valid + * only if DS4DEVAV on. */ + struct vtoc_dev_const DS4DEVCT; /* device constants */ + char DS4AMTIM[8]; /* VSAM time stamp */ + char DS4AMCAT[3]; /* VSAM catalog indicator */ + char DS4R2TIM[8]; /* VSAM volume/catalog match time stamp */ + char res1[5]; /* reserved */ + char DS4F6PTR[5]; /* pointer to first format 6 DSCB */ + struct vtoc_extent DS4VTOCE; /* VTOC extent description */ + char res2[10]; /* reserved */ + __u8 DS4EFLVL; /* extended free-space management level */ + struct vtoc_cchhb DS4EFPTR; /* pointer to extended free-space info */ + char res3; /* reserved */ + __u32 DS4DCYL; /* number of logical cyls */ + char res4[2]; /* reserved */ + __u8 DS4DEVF2; /* device flags */ + char res5; /* reserved */ +} __attribute__ ((packed)); + +struct vtoc_ds5ext +{ + __u16 t; /* RTA of the first track of free extent */ + __u16 fc; /* number of whole cylinders in free ext. */ + __u8 ft; /* number of remaining free tracks */ +} __attribute__ ((packed)); + +struct vtoc_format5_label +{ + char DS5KEYID[4]; /* key identifier */ + struct vtoc_ds5ext DS5AVEXT; /* first available (free-space) extent. */ + struct vtoc_ds5ext DS5EXTAV[7]; /* seven available extents */ + __u8 DS5FMTID; /* format identifier */ + struct vtoc_ds5ext DS5MAVET[18]; /* eighteen available extents */ + struct vtoc_cchhb DS5PTRDS; /* pointer to next format5 DSCB */ +} __attribute__ ((packed)); + +struct vtoc_ds7ext +{ + __u32 a; /* starting RTA value */ + __u32 b; /* ending RTA value + 1 */ +} __attribute__ ((packed)); + +struct vtoc_format7_label +{ + char DS7KEYID[4]; /* key identifier */ + struct vtoc_ds7ext DS7EXTNT[5]; /* space for 5 extent descriptions */ + __u8 DS7FMTID; /* format identifier */ + struct vtoc_ds7ext DS7ADEXT[11]; /* space for 11 extent descriptions */ + char res1[2]; /* reserved */ + struct vtoc_cchhb DS7PTRDS; /* pointer to next FMT7 DSCB */ +} __attribute__ ((packed)); + +struct vtoc_cms_label { + __u8 label_id[4]; /* Label identifier */ + __u8 vol_id[6]; /* Volid */ + __u16 version_id; /* Version identifier */ + __u32 block_size; /* Disk block size */ + __u32 origin_ptr; /* Disk origin pointer */ + __u32 usable_count; /* Number of usable cylinders/blocks */ + __u32 formatted_count; /* Maximum number of formatted cylinders/ + * blocks */ + __u32 block_count; /* Disk size in CMS blocks */ + __u32 used_count; /* Number of CMS blocks in use */ + __u32 fst_size; /* File Status Table (FST) size */ + __u32 fst_count; /* Number of FSTs per CMS block */ + __u8 format_date[6]; /* Disk FORMAT date */ + __u8 reserved1[2]; + __u32 disk_offset; /* Disk offset when reserved*/ + __u32 map_block; /* Allocation Map Block with next hole */ + __u32 hblk_disp; /* Displacement into HBLK data of next hole */ + __u32 user_disp; /* Displacement into user part of Allocation + * map */ + __u8 reserved2[4]; + __u8 segment_name[8]; /* Name of shared segment */ +} __attribute__ ((packed)); + +#endif /* _ASM_S390_VTOC_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h new file mode 100644 index 0000000000..f4785abe1b --- /dev/null +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * include/asm-s390/zcrypt.h + * + * zcrypt 2.2.1 (user-visible header) + * + * Copyright IBM Corp. 2001, 2022 + * Author(s): Robert Burroughs + * Eric Rossman (edrossma@us.ibm.com) + * + * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) + */ + +#ifndef __ASM_S390_ZCRYPT_H +#define __ASM_S390_ZCRYPT_H + +#define ZCRYPT_VERSION 2 +#define ZCRYPT_RELEASE 2 +#define ZCRYPT_VARIANT 1 + +#include +#include +#include + +/* Name of the zcrypt device driver. */ +#define ZCRYPT_NAME "zcrypt" + +/** + * struct ica_rsa_modexpo + * + * Requirements: + * - outputdatalength is at least as large as inputdatalength. + * - All key parts are right justified in their fields, padded on + * the left with zeroes. + * - length(b_key) = inputdatalength + * - length(n_modulus) = inputdatalength + */ +struct ica_rsa_modexpo { + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *b_key; + __u8 __user *n_modulus; +}; + +/** + * struct ica_rsa_modexpo_crt + * + * Requirements: + * - inputdatalength is even. + * - outputdatalength is at least as large as inputdatalength. + * - All key parts are right justified in their fields, padded on + * the left with zeroes. + * - length(bp_key) = inputdatalength/2 + 8 + * - length(bq_key) = inputdatalength/2 + * - length(np_key) = inputdatalength/2 + 8 + * - length(nq_key) = inputdatalength/2 + * - length(u_mult_inv) = inputdatalength/2 + 8 + */ +struct ica_rsa_modexpo_crt { + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *bp_key; + __u8 __user *bq_key; + __u8 __user *np_prime; + __u8 __user *nq_prime; + __u8 __user *u_mult_inv; +}; + +/** + * CPRBX + * Note that all shorts and ints are big-endian. + * All pointer fields are 16 bytes long, and mean nothing. + * + * A request CPRB is followed by a request_parameter_block. + * + * The request (or reply) parameter block is organized thus: + * function code + * VUD block + * key block + */ +struct CPRBX { + __u16 cprb_len; /* CPRB length 220 */ + __u8 cprb_ver_id; /* CPRB version id. 0x02 */ + __u8 ctfm; /* Command Type Filtering Mask */ + __u8 pad_000[2]; /* Alignment pad bytes */ + __u8 func_id[2]; /* function id 0x5432 */ + __u8 cprb_flags[4]; /* Flags */ + __u32 req_parml; /* request parameter buffer len */ + __u32 req_datal; /* request data buffer */ + __u32 rpl_msgbl; /* reply message block length */ + __u32 rpld_parml; /* replied parameter block len */ + __u32 rpl_datal; /* reply data block len */ + __u32 rpld_datal; /* replied data block len */ + __u32 req_extbl; /* request extension block len */ + __u8 _pad_001[4]; /* reserved */ + __u32 rpld_extbl; /* replied extension block len */ + __u8 _pad_002[16 - sizeof(__u8 *)]; + __u8 __user *req_parmb; /* request parm block 'address' */ + __u8 _pad_003[16 - sizeof(__u8 *)]; + __u8 __user *req_datab; /* request data block 'address' */ + __u8 _pad_004[16 - sizeof(__u8 *)]; + __u8 __user *rpl_parmb; /* reply parm block 'address' */ + __u8 _pad_005[16 - sizeof(__u8 *)]; + __u8 __user *rpl_datab; /* reply data block 'address' */ + __u8 _pad_006[16 - sizeof(__u8 *)]; + __u8 __user *req_extb; /* request extension block 'addr'*/ + __u8 _pad_007[16 - sizeof(__u8 *)]; + __u8 __user *rpl_extb; /* reply extension block 'address'*/ + __u16 ccp_rtcode; /* server return code */ + __u16 ccp_rscode; /* server reason code */ + __u32 mac_data_len; /* Mac Data Length */ + __u8 logon_id[8]; /* Logon Identifier */ + __u8 mac_value[8]; /* Mac Value */ + __u8 mac_content_flgs; /* Mac content flag byte */ + __u8 _pad_008; /* Alignment */ + __u16 domain; /* Domain */ + __u8 _pad_009[12]; /* reserved, checked for zeros */ + __u8 _pad_010[36]; /* reserved */ +} __attribute__((packed)); + +/** + * xcRB + */ +struct ica_xcRB { + __u16 agent_ID; + __u32 user_defined; + __u16 request_ID; + __u32 request_control_blk_length; + __u8 _padding1[16 - sizeof(__u8 *)]; + __u8 __user *request_control_blk_addr; + __u32 request_data_length; + __u8 _padding2[16 - sizeof(__u8 *)]; + __u8 __user *request_data_address; + __u32 reply_control_blk_length; + __u8 _padding3[16 - sizeof(__u8 *)]; + __u8 __user *reply_control_blk_addr; + __u32 reply_data_length; + __u8 __padding4[16 - sizeof(__u8 *)]; + __u8 __user *reply_data_addr; + __u16 priority_window; + __u32 status; +} __attribute__((packed)); + +/** + * struct ep11_cprb - EP11 connectivity programming request block + * @cprb_len: CPRB header length [0x0020] + * @cprb_ver_id: CPRB version id. [0x04] + * @pad_000: Alignment pad bytes + * @flags: Admin bit [0x80], Special bit [0x20] + * @func_id: Function id / subtype [0x5434] "T4" + * @source_id: Source id [originator id] + * @target_id: Target id [usage/ctrl domain id] + * @ret_code: Return code + * @reserved1: Reserved + * @reserved2: Reserved + * @payload_len: Payload length + */ +struct ep11_cprb { + __u16 cprb_len; + __u8 cprb_ver_id; + __u8 pad_000[2]; + __u8 flags; + __u8 func_id[2]; + __u32 source_id; + __u32 target_id; + __u32 ret_code; + __u32 reserved1; + __u32 reserved2; + __u32 payload_len; +} __attribute__((packed)); + +/** + * struct ep11_target_dev - EP11 target device list + * @ap_id: AP device id + * @dom_id: Usage domain id + */ +struct ep11_target_dev { + __u16 ap_id; + __u16 dom_id; +}; + +/** + * struct ep11_urb - EP11 user request block + * @targets_num: Number of target adapters + * @targets: Addr to target adapter list + * @weight: Level of request priority + * @req_no: Request id/number + * @req_len: Request length + * @req: Addr to request block + * @resp_len: Response length + * @resp: Addr to response block + */ +struct ep11_urb { + __u16 targets_num; + __u8 __user *targets; + __u64 weight; + __u64 req_no; + __u64 req_len; + __u8 __user *req; + __u64 resp_len; + __u8 __user *resp; +} __attribute__((packed)); + +/** + * struct zcrypt_device_status_ext + * @hwtype: raw hardware type + * @qid: 8 bit device index, 8 bit domain + * @functions: AP device function bit field 'abcdef' + * a, b, c = reserved + * d = CCA coprocessor + * e = Accelerator + * f = EP11 coprocessor + * @online online status + * @reserved reserved + */ +struct zcrypt_device_status_ext { + unsigned int hwtype:8; + unsigned int qid:16; + unsigned int online:1; + unsigned int functions:6; + unsigned int reserved:1; +}; + +#define MAX_ZDEV_CARDIDS_EXT 256 +#define MAX_ZDEV_DOMAINS_EXT 256 + +/* Maximum number of zcrypt devices */ +#define MAX_ZDEV_ENTRIES_EXT (MAX_ZDEV_CARDIDS_EXT * MAX_ZDEV_DOMAINS_EXT) + +/* Device matrix of all zcrypt devices */ +struct zcrypt_device_matrix_ext { + struct zcrypt_device_status_ext device[MAX_ZDEV_ENTRIES_EXT]; +}; + +#define AUTOSELECT 0xFFFFFFFF +#define AUTOSEL_AP ((__u16)0xFFFF) +#define AUTOSEL_DOM ((__u16)0xFFFF) + +#define ZCRYPT_IOCTL_MAGIC 'z' + +/** + * Interface notes: + * + * The ioctl()s which are implemented (along with relevant details) + * are: + * + * ICARSAMODEXPO + * Perform an RSA operation using a Modulus-Exponent pair + * This takes an ica_rsa_modexpo struct as its arg. + * + * NOTE: please refer to the comments preceding this structure + * for the implementation details for the contents of the + * block + * + * ICARSACRT + * Perform an RSA operation using a Chinese-Remainder Theorem key + * This takes an ica_rsa_modexpo_crt struct as its arg. + * + * NOTE: please refer to the comments preceding this structure + * for the implementation details for the contents of the + * block + * + * ZSECSENDCPRB + * Send an arbitrary CPRB to a crypto card. + * + * ZSENDEP11CPRB + * Send an arbitrary EP11 CPRB to an EP11 coprocessor crypto card. + * + * ZCRYPT_DEVICE_STATUS + * The given struct zcrypt_device_matrix_ext is updated with + * status information for each currently known apqn. + * + * ZCRYPT_STATUS_MASK + * Return an MAX_ZDEV_CARDIDS_EXT element array of unsigned chars for the + * status of all devices. + * 0x01: PCICA + * 0x02: PCICC + * 0x03: PCIXCC_MCL2 + * 0x04: PCIXCC_MCL3 + * 0x05: CEX2C + * 0x06: CEX2A + * 0x07: CEX3C + * 0x08: CEX3A + * 0x0a: CEX4 + * 0x0b: CEX5 + * 0x0c: CEX6, CEX7 or CEX8 + * 0x0d: device is disabled + * + * ZCRYPT_QDEPTH_MASK + * Return an MAX_ZDEV_CARDIDS_EXT element array of unsigned chars for the + * queue depth of all devices. + * + * ZCRYPT_PERDEV_REQCNT + * Return an MAX_ZDEV_CARDIDS_EXT element array of unsigned integers for + * the number of successfully completed requests per device since the + * device was detected and made available. + * + */ + +/** + * Supported ioctl calls + */ +#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) +#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) +#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) + +#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0) +#define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT]) +#define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT]) +#define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT]) + +/* + * Support for multiple zcrypt device nodes. + */ + +/* Nr of minor device node numbers to allocate. */ +#define ZCRYPT_MAX_MINOR_NODES 256 + +/* Max amount of possible ioctls */ +#define MAX_ZDEV_IOCTLS (1 << _IOC_NRBITS) + +/* + * Only deprecated defines, structs and ioctls below this line. + */ + +/* Deprecated: use MAX_ZDEV_CARDIDS_EXT */ +#define MAX_ZDEV_CARDIDS 64 +/* Deprecated: use MAX_ZDEV_DOMAINS_EXT */ +#define MAX_ZDEV_DOMAINS 256 + +/* Deprecated: use MAX_ZDEV_ENTRIES_EXT */ +#define MAX_ZDEV_ENTRIES (MAX_ZDEV_CARDIDS * MAX_ZDEV_DOMAINS) + +/* Deprecated: use struct zcrypt_device_status_ext */ +struct zcrypt_device_status { + unsigned int hwtype:8; + unsigned int qid:14; + unsigned int online:1; + unsigned int functions:6; + unsigned int reserved:3; +}; + +/* Deprecated: use struct zcrypt_device_matrix_ext */ +struct zcrypt_device_matrix { + struct zcrypt_device_status device[MAX_ZDEV_ENTRIES]; +}; + +/* Deprecated: use ZCRYPT_DEVICE_STATUS */ +#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0) +/* Deprecated: use ZCRYPT_STATUS_MASK */ +#define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64]) +/* Deprecated: use ZCRYPT_QDEPTH_MASK */ +#define Z90STAT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x49, char[64]) +/* Deprecated: use ZCRYPT_PERDEV_REQCNT */ +#define Z90STAT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x4a, int[64]) + +/* Deprecated: use sysfs to query these values */ +#define Z90STAT_REQUESTQ_COUNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x44, int) +#define Z90STAT_PENDINGQ_COUNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x45, int) +#define Z90STAT_TOTALOPEN_COUNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x46, int) +#define Z90STAT_DOMAIN_INDEX _IOR(ZCRYPT_IOCTL_MAGIC, 0x47, int) + +/* + * The ioctl number ranges 0x40 - 0x42 and 0x4b - 0x4e had been used in the + * past, don't assign new ioctls for these. + */ + +#endif /* __ASM_S390_ZCRYPT_H */ diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore new file mode 100644 index 0000000000..bbb90f92d0 --- /dev/null +++ b/arch/s390/kernel/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vmlinux.lds diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile new file mode 100644 index 0000000000..0df2b88cc0 --- /dev/null +++ b/arch/s390/kernel/Makefile @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +ifdef CONFIG_FUNCTION_TRACER + +# Do not trace tracer code +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace early setup code +CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) + +endif + +GCOV_PROFILE_early.o := n +KCOV_INSTRUMENT_early.o := n +UBSAN_SANITIZE_early.o := n +KASAN_SANITIZE_ipl.o := n +KASAN_SANITIZE_machine_kexec.o := n + +# +# Passing null pointers is ok for smp code, since we access the lowcore here. +# +CFLAGS_smp.o := -Wno-nonnull + +# +# Disable tailcall optimizations for stack / callchain walking functions +# since this might generate broken code when accessing register 15 and +# passing its content to other functions. +# +CFLAGS_stacktrace.o += -fno-optimize-sibling-calls +CFLAGS_dumpstack.o += -fno-optimize-sibling-calls +CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls + +obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o +obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o +obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o +obj-y += sysinfo.o lgr.o os_info.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o +obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o + +extra-y += vmlinux.lds + +obj-$(CONFIG_SYSFS) += nospec-sysfs.o +CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) + +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_AUDIT) += audit.o +compat-obj-$(CONFIG_AUDIT) += compat_audit.o +obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o +obj-$(CONFIG_COMPAT) += $(compat-obj-y) +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KPROBES) += kprobes_insn_page.o +obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o + +obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_CERT_STORE) += cert_store.o +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o + +obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o +obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o + +obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o + +# vdso +obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 0000000000..f9efc54ec4 --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +unsigned long __bootdata_preserved(__abs_lowcore); + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. + */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c new file mode 100644 index 0000000000..e7bca29f9c --- /dev/null +++ b/arch/s390/kernel/alternative.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +static int __initdata_or_module alt_instr_disabled; + +static int __init disable_alternative_instructions(char *str) +{ + alt_instr_disabled = 1; + return 0; +} + +early_param("noaltinstr", disable_alternative_instructions); + +static void __init_or_module __apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. + */ + for (a = start; a < end; a++) { + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + + if (!__test_facility(a->facility, alt_stfle_fac_list)) + continue; + + if (unlikely(a->instrlen % 2)) { + WARN_ONCE(1, "cpu alternatives instructions length is " + "odd, skipping patching\n"); + continue; + } + + s390_kernel_write(instr, replacement, a->instrlen); + } +} + +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + if (!alt_instr_disabled) + __apply_alternatives(start, end); +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +void __init apply_alternative_instructions(void) +{ + apply_alternatives(__alt_instructions, __alt_instructions_end); +} + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c new file mode 100644 index 0000000000..fa5f6885c7 --- /dev/null +++ b/arch/s390/kernel/asm-offsets.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ + +#define ASM_OFFSETS_C + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + /* task struct offsets */ + OFFSET(__TASK_stack, task_struct, stack); + OFFSET(__TASK_thread, task_struct, thread); + OFFSET(__TASK_pid, task_struct, pid); + BLANK(); + /* thread struct offsets */ + OFFSET(__THREAD_ksp, thread_struct, ksp); + BLANK(); + /* thread info offsets */ + OFFSET(__TI_flags, task_struct, thread_info.flags); + BLANK(); + /* pt_regs offsets */ + OFFSET(__PT_PSW, pt_regs, psw); + OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_R0, pt_regs, gprs[0]); + OFFSET(__PT_R1, pt_regs, gprs[1]); + OFFSET(__PT_R2, pt_regs, gprs[2]); + OFFSET(__PT_R3, pt_regs, gprs[3]); + OFFSET(__PT_R4, pt_regs, gprs[4]); + OFFSET(__PT_R5, pt_regs, gprs[5]); + OFFSET(__PT_R6, pt_regs, gprs[6]); + OFFSET(__PT_R7, pt_regs, gprs[7]); + OFFSET(__PT_R8, pt_regs, gprs[8]); + OFFSET(__PT_R9, pt_regs, gprs[9]); + OFFSET(__PT_R10, pt_regs, gprs[10]); + OFFSET(__PT_R11, pt_regs, gprs[11]); + OFFSET(__PT_R12, pt_regs, gprs[12]); + OFFSET(__PT_R13, pt_regs, gprs[13]); + OFFSET(__PT_R14, pt_regs, gprs[14]); + OFFSET(__PT_R15, pt_regs, gprs[15]); + OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); + OFFSET(__PT_FLAGS, pt_regs, flags); + OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); + DEFINE(__PT_SIZE, sizeof(struct pt_regs)); + BLANK(); + /* stack_frame offsets */ + OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); + OFFSET(__SF_GPRS, stack_frame, gprs); + OFFSET(__SF_EMPTY, stack_frame, empty[0]); + OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); + OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); + BLANK(); + /* idle data offsets */ + OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); + OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); + OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); + BLANK(); + /* hardware defined lowcore locations 0x000 - 0x1ff */ + OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); + OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); + OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); + OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); + OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); + OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); + OFFSET(__LC_MON_CLASS_NR, lowcore, mon_class_num); + OFFSET(__LC_PER_CODE, lowcore, per_code); + OFFSET(__LC_PER_ATMID, lowcore, per_atmid); + OFFSET(__LC_PER_ADDRESS, lowcore, per_address); + OFFSET(__LC_EXC_ACCESS_ID, lowcore, exc_access_id); + OFFSET(__LC_PER_ACCESS_ID, lowcore, per_access_id); + OFFSET(__LC_OP_ACCESS_ID, lowcore, op_access_id); + OFFSET(__LC_AR_MODE_ID, lowcore, ar_mode_id); + OFFSET(__LC_TRANS_EXC_CODE, lowcore, trans_exc_code); + OFFSET(__LC_MON_CODE, lowcore, monitor_code); + OFFSET(__LC_SUBCHANNEL_ID, lowcore, subchannel_id); + OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr); + OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm); + OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word); + OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); + OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); + OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); + OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); + OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); + OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); + OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); + OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); + OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); + OFFSET(__LC_PGM_OLD_PSW, lowcore, program_old_psw); + OFFSET(__LC_MCK_OLD_PSW, lowcore, mcck_old_psw); + OFFSET(__LC_IO_OLD_PSW, lowcore, io_old_psw); + OFFSET(__LC_RST_NEW_PSW, lowcore, restart_psw); + OFFSET(__LC_EXT_NEW_PSW, lowcore, external_new_psw); + OFFSET(__LC_SVC_NEW_PSW, lowcore, svc_new_psw); + OFFSET(__LC_PGM_NEW_PSW, lowcore, program_new_psw); + OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw); + OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw); + /* software defined lowcore locations 0x200 - 0xdff*/ + OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); + OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); + OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); + OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); + OFFSET(__LC_RETURN_PSW, lowcore, return_psw); + OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); + OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); + OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer); + OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer); + OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); + OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); + OFFSET(__LC_INT_CLOCK, lowcore, int_clock); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); + OFFSET(__LC_CURRENT, lowcore, current_task); + OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); + OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); + OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack); + OFFSET(__LC_RESTART_STACK, lowcore, restart_stack); + OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack); + OFFSET(__LC_RESTART_FN, lowcore, restart_fn); + OFFSET(__LC_RESTART_DATA, lowcore, restart_data); + OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); + OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags); + OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce); + OFFSET(__LC_USER_ASCE, lowcore, user_asce); + OFFSET(__LC_LPP, lowcore, lpp); + OFFSET(__LC_CURRENT_PID, lowcore, current_pid); + OFFSET(__LC_GMAP, lowcore, gmap); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); + /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ + OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); + OFFSET(__LC_OS_INFO, lowcore, os_info); + /* hardware defined lowcore locations 0x1000 - 0x18ff */ + OFFSET(__LC_MCESAD, lowcore, mcesad); + OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); + OFFSET(__LC_FPREGS_SAVE_AREA, lowcore, floating_pt_save_area); + OFFSET(__LC_GPREGS_SAVE_AREA, lowcore, gpregs_save_area); + OFFSET(__LC_PSW_SAVE_AREA, lowcore, psw_save_area); + OFFSET(__LC_PREFIX_SAVE_AREA, lowcore, prefixreg_save_area); + OFFSET(__LC_FP_CREG_SAVE_AREA, lowcore, fpt_creg_save_area); + OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); + OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); + OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); + OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); + OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); + OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); + BLANK(); + /* gmap/sie offsets */ + OFFSET(__GMAP_ASCE, gmap, asce); + OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); + OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20); + /* kexec_sha_region */ + OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start); + OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len); + DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region)); + /* sizeof kernel parameter area */ + DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea)); + /* kernel parameter area offsets */ + DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device)); + DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start)); + DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size)); + DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); + DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); + DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* function graph return value tracing */ + OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2); + OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); + DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); +#endif + OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); + return 0; +} diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c new file mode 100644 index 0000000000..02051a596b --- /dev/null +++ b/arch/s390/kernel/audit.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "audit.h" + +static unsigned dir_class[] = { +#include +~0U +}; + +static unsigned read_class[] = { +#include +~0U +}; + +static unsigned write_class[] = { +#include +~0U +}; + +static unsigned chattr_class[] = { +#include +~0U +}; + +static unsigned signal_class[] = { +#include +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_COMPAT + if (arch == AUDIT_ARCH_S390) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_COMPAT + if (abi == AUDIT_ARCH_S390) + return s390_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return AUDITSC_OPEN; + case __NR_openat: + return AUDITSC_OPENAT; + case __NR_socketcall: + return AUDITSC_SOCKETCALL; + case __NR_execve: + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; + default: + return AUDITSC_NATIVE; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_COMPAT + audit_register_class(AUDIT_CLASS_WRITE_32, s390_write_class); + audit_register_class(AUDIT_CLASS_READ_32, s390_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, s390_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, s390_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, s390_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/s390/kernel/audit.h b/arch/s390/kernel/audit.h new file mode 100644 index 0000000000..4d4b596412 --- /dev/null +++ b/arch/s390/kernel/audit.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_KERNEL_AUDIT_H +#define __ARCH_S390_KERNEL_AUDIT_H + +#include + +#ifdef CONFIG_COMPAT +extern int s390_classify_syscall(unsigned); +extern __u32 s390_dir_class[]; +extern __u32 s390_write_class[]; +extern __u32 s390_read_class[]; +extern __u32 s390_chattr_class[]; +extern __u32 s390_signal_class[]; +#endif /* CONFIG_COMPAT */ + +#endif /* __ARCH_S390_KERNEL_AUDIT_H */ diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c new file mode 100644 index 0000000000..56254fa06f --- /dev/null +++ b/arch/s390/kernel/cache.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Extract CPU cache information and expose them via sysfs. + * + * Copyright IBM Corp. 2012 + */ + +#include +#include +#include +#include + +enum { + CACHE_SCOPE_NOTEXISTS, + CACHE_SCOPE_PRIVATE, + CACHE_SCOPE_SHARED, + CACHE_SCOPE_RESERVED, +}; + +enum { + CTYPE_SEPARATE, + CTYPE_DATA, + CTYPE_INSTRUCTION, + CTYPE_UNIFIED, +}; + +enum { + EXTRACT_TOPOLOGY, + EXTRACT_LINE_SIZE, + EXTRACT_SIZE, + EXTRACT_ASSOCIATIVITY, +}; + +enum { + CACHE_TI_UNIFIED = 0, + CACHE_TI_DATA = 0, + CACHE_TI_INSTRUCTION, +}; + +struct cache_info { + unsigned char : 4; + unsigned char scope : 2; + unsigned char type : 2; +}; + +#define CACHE_MAX_LEVEL 8 +union cache_topology { + struct cache_info ci[CACHE_MAX_LEVEL]; + unsigned long raw; +}; + +static const char * const cache_type_string[] = { + "", + "Instruction", + "Data", + "", + "Unified", +}; + +static const enum cache_type cache_type_map[] = { + [CTYPE_SEPARATE] = CACHE_TYPE_SEPARATE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INSTRUCTION] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +void show_cacheinfo(struct seq_file *m) +{ + struct cpu_cacheinfo *this_cpu_ci; + struct cacheinfo *cache; + int idx; + + this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + cache = this_cpu_ci->info_list + idx; + seq_printf(m, "cache%-11d: ", idx); + seq_printf(m, "level=%d ", cache->level); + seq_printf(m, "type=%s ", cache_type_string[cache->type]); + seq_printf(m, "scope=%s ", + cache->disable_sysfs ? "Shared" : "Private"); + seq_printf(m, "size=%dK ", cache->size >> 10); + seq_printf(m, "line_size=%u ", cache->coherency_line_size); + seq_printf(m, "associativity=%d", cache->ways_of_associativity); + seq_puts(m, "\n"); + } +} + +static inline enum cache_type get_cache_type(struct cache_info *ci, int level) +{ + if (level >= CACHE_MAX_LEVEL) + return CACHE_TYPE_NOCACHE; + ci += level; + if (ci->scope != CACHE_SCOPE_SHARED && ci->scope != CACHE_SCOPE_PRIVATE) + return CACHE_TYPE_NOCACHE; + return cache_type_map[ci->type]; +} + +static inline unsigned long ecag(int ai, int li, int ti) +{ + return __ecag(ECAG_CACHE_ATTRIBUTE, ai << 4 | li << 1 | ti); +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, int private, + enum cache_type type, unsigned int level, int cpu) +{ + int ti, num_sets; + + if (type == CACHE_TYPE_INST) + ti = CACHE_TI_INSTRUCTION; + else + ti = CACHE_TI_UNIFIED; + this_leaf->level = level + 1; + this_leaf->type = type; + this_leaf->coherency_line_size = ecag(EXTRACT_LINE_SIZE, level, ti); + this_leaf->ways_of_associativity = ecag(EXTRACT_ASSOCIATIVITY, level, ti); + this_leaf->size = ecag(EXTRACT_SIZE, level, ti); + num_sets = this_leaf->size / this_leaf->coherency_line_size; + num_sets /= this_leaf->ways_of_associativity; + this_leaf->number_of_sets = num_sets; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + if (!private) + this_leaf->disable_sysfs = true; +} + +int init_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + unsigned int level = 0, leaves = 0; + union cache_topology ct; + enum cache_type ctype; + + if (!this_cpu_ci) + return -EINVAL; + ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); + do { + ctype = get_cache_type(&ct.ci[0], level); + if (ctype == CACHE_TYPE_NOCACHE) + break; + /* Separate instruction and data caches */ + leaves += (ctype == CACHE_TYPE_SEPARATE) ? 2 : 1; + } while (++level < CACHE_MAX_LEVEL); + this_cpu_ci->num_levels = level; + this_cpu_ci->num_leaves = leaves; + return 0; +} + +int populate_cache_leaves(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + unsigned int level, idx, pvt; + union cache_topology ct; + enum cache_type ctype; + + ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); + for (idx = 0, level = 0; level < this_cpu_ci->num_levels && + idx < this_cpu_ci->num_leaves; idx++, level++) { + if (!this_leaf) + return -EINVAL; + pvt = (ct.ci[level].scope == CACHE_SCOPE_PRIVATE) ? 1 : 0; + ctype = get_cache_type(&ct.ci[0], level); + if (ctype == CACHE_TYPE_SEPARATE) { + ci_leaf_init(this_leaf++, pvt, CACHE_TYPE_DATA, level, cpu); + ci_leaf_init(this_leaf++, pvt, CACHE_TYPE_INST, level, cpu); + } else { + ci_leaf_init(this_leaf++, pvt, ctype, level, cpu); + } + } + return 0; +} diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 0000000000..554447768b --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 2023 + * Author(s): Anastasia Eskova + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. */ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. */ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description, sizeof(ascii)); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. */ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. */ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. */ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vce_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in + * VCE. Return -EINVAL if hashes don't match. + */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. */ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). + */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create key description in format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to key description or NULL if memory + * allocation failed. Memory should be freed by caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%05u:%010u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 5 + 1 + 10 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + snprintf(desc + name_len, len - name_len, ":%05u:%010u", + vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from VCE for key + * payload and key description. Link the key to "cert_store" keyring. + */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode2. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR. */ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. */ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for single-entry VCB, get VCB via DIAG320 subcode 2 call, + * extract VCE and create a key from its' certificate. + */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entries indices start with 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow to link more keys to certificate store keyring after all + * the VCEs were processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed. + * Return -ENOENT if subcodes 1 or 2 are not available. + */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + rc = -ENOMEM; + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. */ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c new file mode 100644 index 0000000000..a7c46e8310 --- /dev/null +++ b/arch/s390/kernel/compat_audit.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#undef __s390x__ +#include +#include +#include "audit.h" + +unsigned s390_dir_class[] = { +#include +~0U +}; + +unsigned s390_chattr_class[] = { +#include +~0U +}; + +unsigned s390_write_class[] = { +#include +~0U +}; + +unsigned s390_read_class[] = { +#include +~0U +}; + +unsigned s390_signal_class[] = { +#include +~0U +}; + +int s390_classify_syscall(unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return AUDITSC_OPEN; + case __NR_openat: + return AUDITSC_OPENAT; + case __NR_socketcall: + return AUDITSC_SOCKETCALL; + case __NR_execve: + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; + default: + return AUDITSC_COMPAT; + } +} diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c new file mode 100644 index 0000000000..f9d418d1b6 --- /dev/null +++ b/arch/s390/kernel/compat_linux.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Gerhard Tonn (ton@de.ibm.com) + * Thomas Spatzier (tspat@de.ibm.com) + * + * Conversion between 31bit and 64bit native syscalls. + * + * Heavily inspired by the 32-bit Sparc compat code which is + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "compat_linux.h" + +#ifdef CONFIG_SYSVIPC +COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, compat_ulong_t, second, + compat_ulong_t, third, compat_uptr_t, ptr) +{ + if (call >> 16) /* hack for backward compatibility */ + return -EINVAL; + return compat_ksys_ipc(call, first, second, third, ptr, third); +} +#endif + +COMPAT_SYSCALL_DEFINE3(s390_truncate64, const char __user *, path, u32, high, u32, low) +{ + return ksys_truncate(path, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low) +{ + return ksys_ftruncate(fd, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE5(s390_pread64, unsigned int, fd, char __user *, ubuf, + compat_size_t, count, u32, high, u32, low) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + return ksys_pread64(fd, ubuf, count, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubuf, + compat_size_t, count, u32, high, u32, low) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + return ksys_pwrite64(fd, ubuf, count, (unsigned long)high << 32 | low); +} + +COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) +{ + return ksys_readahead(fd, (unsigned long)high << 32 | low, count); +} + +struct stat64_emu31 { + unsigned long long st_dev; + unsigned int __pad1; +#define STAT64_HAS_BROKEN_ST_INO 1 + u32 __st_ino; + unsigned int st_mode; + unsigned int st_nlink; + u32 st_uid; + u32 st_gid; + unsigned long long st_rdev; + unsigned int __pad3; + long st_size; + u32 st_blksize; + unsigned char __pad4[4]; + u32 __pad5; /* future possible st_blocks high bits */ + u32 st_blocks; /* Number 512-byte blocks allocated. */ + u32 st_atime; + u32 __pad6; + u32 st_mtime; + u32 __pad7; + u32 st_ctime; + u32 __pad8; /* will be high 32 bits of ctime someday */ + unsigned long st_ino; +}; + +static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) +{ + struct stat64_emu31 tmp; + + memset(&tmp, 0, sizeof(tmp)); + + tmp.st_dev = huge_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + tmp.__st_ino = (u32)stat->ino; + tmp.st_mode = stat->mode; + tmp.st_nlink = (unsigned int)stat->nlink; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); + tmp.st_rdev = huge_encode_dev(stat->rdev); + tmp.st_size = stat->size; + tmp.st_blksize = (u32)stat->blksize; + tmp.st_blocks = (u32)stat->blocks; + tmp.st_atime = (u32)stat->atime.tv_sec; + tmp.st_mtime = (u32)stat->mtime.tv_sec; + tmp.st_ctime = (u32)stat->ctime.tv_sec; + + return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(s390_stat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(s390_lstat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(s390_fstat64, unsigned int, fd, struct stat64_emu31 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(s390_fstatat64, unsigned int, dfd, const char __user *, filename, + struct stat64_emu31 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct_emu31 { + compat_ulong_t addr; + compat_ulong_t len; + compat_ulong_t prot; + compat_ulong_t flags; + compat_ulong_t fd; + compat_ulong_t offset; +}; + +COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg) +{ + struct mmap_arg_struct_emu31 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} + +COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) +{ + struct mmap_arg_struct_emu31 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); +} + +COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + + return ksys_read(fd, buf, count); +} + +COMPAT_SYSCALL_DEFINE3(s390_write, unsigned int, fd, const char __user *, buf, compat_size_t, count) +{ + if ((compat_ssize_t) count < 0) + return -EINVAL; + + return ksys_write(fd, buf, count); +} + +/* + * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64. + * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE} + * because the 31 bit values differ from the 64 bit values. + */ + +COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size_t, len, int, advise) +{ + if (advise == 4) + advise = POSIX_FADV_DONTNEED; + else if (advise == 5) + advise = POSIX_FADV_NOREUSE; + return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, + advise); +} + +struct fadvise64_64_args { + int fd; + long long offset; + long long len; + int advice; +}; + +COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) +{ + struct fadvise64_64_args a; + + if ( copy_from_user(&a, args, sizeof(a)) ) + return -EFAULT; + if (a.advice == 4) + a.advice = POSIX_FADV_DONTNEED; + else if (a.advice == 5) + a.advice = POSIX_FADV_NOREUSE; + return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); +} + +COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, + u32, nhigh, u32, nlow, unsigned int, flags) +{ + return ksys_sync_file_range(fd, ((loff_t)offhigh << 32) + offlow, + ((u64)nhigh << 32) + nlow, flags); +} + +COMPAT_SYSCALL_DEFINE6(s390_fallocate, int, fd, int, mode, u32, offhigh, u32, offlow, + u32, lenhigh, u32, lenlow) +{ + return ksys_fallocate(fd, mode, ((loff_t)offhigh << 32) + offlow, + ((u64)lenhigh << 32) + lenlow); +} diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h new file mode 100644 index 0000000000..ef23739b27 --- /dev/null +++ b/arch/s390/kernel/compat_linux.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390X_S390_H +#define _ASM_S390X_S390_H + +#include +#include +#include +#include + +/* + * Macro that masks the high order bit of a 32 bit pointer and + * converts it to a 64 bit pointer. + */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) ((unsigned long)(__x)) + +/* Now 32bit compatibility types */ +struct ipc_kludge_32 { + __u32 msgp; /* pointer */ + __s32 msgtyp; +}; + +/* asm/sigcontext.h */ +typedef union { + __u64 d; + __u32 f; +} freg_t32; + +typedef struct { + unsigned int fpc; + unsigned int pad; + freg_t32 fprs[__NUM_FPRS]; +} _s390_fp_regs32; + +typedef struct { + psw_t32 psw; + __u32 gprs[__NUM_GPRS]; + __u32 acrs[__NUM_ACRS]; +} _s390_regs_common32; + +typedef struct { + _s390_regs_common32 regs; + _s390_fp_regs32 fpregs; +} _sigregs32; + +typedef struct { + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; +} _sigregs_ext32; + +#define _SIGCONTEXT_NSIG32 64 +#define _SIGCONTEXT_NSIG_BPW32 32 +#define __SIGNAL_FRAMESIZE32 96 +#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) + +struct sigcontext32 { + __u32 oldmask[_COMPAT_NSIG_WORDS]; + __u32 sregs; /* pointer */ +}; + +/* asm/signal.h */ + +/* asm/ucontext.h */ +struct ucontext32 { + __u32 uc_flags; + __u32 uc_link; /* pointer */ + compat_stack_t uc_stack; + _sigregs32 uc_mcontext; + compat_sigset_t uc_sigmask; + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + unsigned char __unused[128 - sizeof(compat_sigset_t)]; + _sigregs_ext32 uc_mcontext_ext; +}; + +struct stat64_emu31; +struct mmap_arg_struct_emu31; +struct fadvise64_64_args; + +long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); +long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); +long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); +long compat_sys_s390_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 high, u32 low); +long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count); +long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf); +long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); +long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); +long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); +long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); +long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); +long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); +long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); +long compat_sys_s390_fallocate(int fd, int mode, u32 offhigh, u32 offlow, u32 lenhigh, u32 lenlow); +long compat_sys_sigreturn(void); +long compat_sys_rt_sigreturn(void); + +#endif /* _ASM_S390X_S390_H */ diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h new file mode 100644 index 0000000000..3c400fc7e9 --- /dev/null +++ b/arch/s390/kernel/compat_ptrace.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PTRACE32_H +#define _PTRACE32_H + +#include /* needed for NUM_CR_WORDS */ +#include "compat_linux.h" /* needed for psw_compat_t */ + +struct compat_per_struct_kernel { + __u32 cr9; /* PER control bits */ + __u32 cr10; /* PER starting address */ + __u32 cr11; /* PER ending address */ + __u32 bits; /* Obsolete software bits */ + __u32 starting_addr; /* User specified start address */ + __u32 ending_addr; /* User specified end address */ + __u16 perc_atmid; /* PER trap ATMID */ + __u32 address; /* PER trap instruction address */ + __u8 access_id; /* PER trap access identification */ +}; + +struct compat_user_regs_struct +{ + psw_compat_t psw; + u32 gprs[NUM_GPRS]; + u32 acrs[NUM_ACRS]; + u32 orig_gpr2; + /* nb: there's a 4-byte hole here */ + s390_fp_regs fp_regs; + /* + * These per registers are in here so that gdb can modify them + * itself as there is no "official" ptrace interface for hardware + * watchpoints. This is the way intel does it. + */ + struct compat_per_struct_kernel per_info; + u32 ieee_instruction_pointer; /* obsolete, always 0 */ +}; + +struct compat_user { + /* We start with the registers, to mimic the way that "memory" + is returned from the ptrace(3,...) function. */ + struct compat_user_regs_struct regs; + /* The rest of this junk is to help gdb figure out what goes where */ + u32 u_tsize; /* Text segment size (pages). */ + u32 u_dsize; /* Data segment size (pages). */ + u32 u_ssize; /* Stack segment size (pages). */ + u32 start_code; /* Starting virtual address of text. */ + u32 start_stack; /* Starting virtual address of stack area. + This is actually the bottom of the stack, + the top of the stack is always found in the + esp register. */ + s32 signal; /* Signal that caused the core dump. */ + u32 u_ar0; /* Used by gdb to help find the values for */ + /* the registers. */ + u32 magic; /* To uniquely identify a core file */ + char u_comm[32]; /* User command that was responsible */ +}; + +typedef struct +{ + __u32 len; + __u32 kernel_addr; + __u32 process_addr; +} compat_ptrace_area; + +#endif /* _PTRACE32_H */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c new file mode 100644 index 0000000000..cecedd01d4 --- /dev/null +++ b/arch/s390/kernel/compat_signal.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2000, 2006 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * Gerhard Tonn (ton@de.ibm.com) + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat_linux.h" +#include "compat_ptrace.h" +#include "entry.h" + +typedef struct +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; + struct sigcontext32 sc; + _sigregs32 sregs; + int signo; + _sigregs_ext32 sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ +} sigframe32; + +typedef struct +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; + __u16 svc_insn; + compat_siginfo_t info; + struct ucontext32 uc; +} rt_sigframe32; + +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + save_access_regs(current->thread.acrs); + save_fpu_regs(); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + restore_access_regs(current->thread.acrs); +} + +static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) +{ + _sigregs32 user_sregs; + int i; + + user_sregs.regs.psw.mask = (__u32)(regs->psw.mask >> 32); + user_sregs.regs.psw.mask &= PSW32_MASK_USER | PSW32_MASK_RI; + user_sregs.regs.psw.mask |= PSW32_USER_BITS; + user_sregs.regs.psw.addr = (__u32) regs->psw.addr | + (__u32)(regs->psw.mask & PSW_MASK_BA); + for (i = 0; i < NUM_GPRS; i++) + user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) + return -EFAULT; + return 0; +} + +static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) +{ + _sigregs32 user_sregs; + int i; + + /* Always make any pending restarted system call return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) + return -EFAULT; + + if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) + return -EINVAL; + + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ + regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | + (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | + (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_RI) << 32 | + (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_AMODE); + /* Check for invalid user address space control. */ + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) + regs->psw.mask = PSW_ASC_PRIMARY | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); + for (i = 0; i < NUM_GPRS; i++) + regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; + memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, + sizeof(current->thread.acrs)); + fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); + + clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + return 0; +} + +static int save_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) +{ + __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Save high gprs to signal stack */ + for (i = 0; i < NUM_GPRS; i++) + gprs_high[i] = regs->gprs[i] >> 32; + if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, + sizeof(sregs_ext->gprs_high))) + return -EFAULT; + + /* Save vector registers to signal stack */ + if (MACHINE_HAS_VX) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = current->thread.fpu.vxrs[i].low; + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } + return 0; +} + +static int restore_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) +{ + __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Restore high gprs from signal stack */ + if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, + sizeof(sregs_ext->gprs_high))) + return -EFAULT; + for (i = 0; i < NUM_GPRS; i++) + *(__u32 *)®s->gprs[i] = gprs_high[i]; + + /* Restore vector registers from signal stack */ + if (MACHINE_HAS_VX) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + current->thread.fpu.vxrs[i].low = vxrs[i]; + } + return 0; +} + +COMPAT_SYSCALL_DEFINE0(sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; + sigset_t set; + + if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) + goto badframe; + set_current_blocked(&set); + save_fpu_regs(); + if (restore_sigregs32(regs, &frame->sregs)) + goto badframe; + if (restore_sigregs_ext32(regs, &frame->sregs_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV); + return 0; +} + +COMPAT_SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; + sigset_t set; + + if (get_compat_sigset(&set, &frame->uc.uc_sigmask)) + goto badframe; + set_current_blocked(&set); + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + save_fpu_regs(); + if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) + goto badframe; + if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV); + return 0; +} + +/* + * Set up a signal frame. + */ + + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = (unsigned long) A(regs->gprs[15]); + + /* Overflow on alternate signal stack gives SIGSEGV. */ + if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) + return (void __user *) -1UL; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (! sas_ss_flags(sp)) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)((sp - frame_size) & -8ul); +} + +static int setup_frame32(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + int sig = ksig->sig; + sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; + + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); + if (!MACHINE_HAS_VX) + frame_size -= sizeof(frame->sregs_ext.vxrs_low) + + sizeof(frame->sregs_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) + return -EFAULT; + + /* Create struct sigcontext32 on the signal stack */ + if (put_compat_sigset((compat_sigset_t __user *)frame->sc.oldmask, + set, sizeof(compat_sigset_t))) + return -EFAULT; + if (__put_user(ptr_to_compat(&frame->sregs), &frame->sc.sregs)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs32 on the signal stack */ + if (save_sigregs32(regs, &frame->sregs)) + return -EFAULT; + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) + return -EFAULT; + + /* Create _sigregs_ext32 on the signal stack */ + if (save_sigregs_ext32(regs, &frame->sregs_ext)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + } else { + restorer = VDSO32_SYMBOL(current, sigreturn); + } + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (__force __u64) frame; + /* Force 31 bit amode and default user address space control. */ + regs->psw.mask = PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__force __u64) ksig->ka.sa.sa_handler; + + regs->gprs[2] = sig; + regs->gprs[3] = (__force __u64) &frame->sc; + + /* We forgot to include these in the sigcontext. + To avoid breaking binary compatibility, they are passed as args. */ + if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP || sig == SIGFPE) { + /* set extra registers only for synchronous signals */ + regs->gprs[4] = regs->int_code & 127; + regs->gprs[5] = regs->int_parm_long; + regs->gprs[6] = current->thread.last_break; + } + + return 0; +} + +static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + rt_sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; + u32 uc_flags; + + frame_size = sizeof(*frame) - + sizeof(frame->uc.uc_mcontext_ext.__reserved); + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + uc_flags = UC_GPRS_HIGH; + if (MACHINE_HAS_VX) { + uc_flags |= UC_VXRS; + } else + frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + } else { + restorer = VDSO32_SYMBOL(current, rt_sigreturn); + } + + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(0, &frame->uc.uc_link) || + __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs32(regs, &frame->uc.uc_mcontext) || + put_compat_sigset(&frame->uc.uc_sigmask, set, sizeof(compat_sigset_t)) || + save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (__force __u64) frame; + /* Force 31 bit amode and default user address space control. */ + regs->psw.mask = PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (__u64 __force) ksig->ka.sa.sa_handler; + + regs->gprs[2] = ksig->sig; + regs->gprs[3] = (__force __u64) &frame->info; + regs->gprs[4] = (__force __u64) &frame->uc; + regs->gprs[5] = current->thread.last_break; + return 0; +} + +/* + * OK, we're invoking a handler + */ + +void handle_signal32(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + ret = setup_rt_frame32(ksig, oldset, regs); + else + ret = setup_frame32(ksig, oldset, regs); + + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); +} + diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c new file mode 100644 index 0000000000..b210a29d3e --- /dev/null +++ b/arch/s390/kernel/cpcmd.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2007 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Christian Borntraeger (cborntra@de.ibm.com), + */ + +#define KMSG_COMPONENT "cpcmd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(cpcmd_lock); +static char cpcmd_buf[241]; + +static int diag8_noresponse(int cmdlen) +{ + asm volatile( + " diag %[rx],%[ry],0x8\n" + : [ry] "+&d" (cmdlen) + : [rx] "d" (__pa(cpcmd_buf)) + : "cc"); + return cmdlen; +} + +static int diag8_response(int cmdlen, char *response, int *rlen) +{ + union register_pair rx, ry; + int cc; + + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); + ry.even = cmdlen | 0x40000000L; + ry.odd = *rlen; + asm volatile( + " diag %[rx],%[ry],0x8\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [ry] "+&d" (ry.pair) + : [rx] "d" (rx.pair) + : "cc"); + if (cc) + *rlen += ry.odd; + else + *rlen = ry.odd; + return ry.even; +} + +/* + * __cpcmd has some restrictions over cpcmd + * - __cpcmd is unlocked and therefore not SMP-safe + */ +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) +{ + int cmdlen; + int rc; + int response_len; + + cmdlen = strlen(cmd); + BUG_ON(cmdlen > 240); + memcpy(cpcmd_buf, cmd, cmdlen); + ASCEBC(cpcmd_buf, cmdlen); + + diag_stat_inc(DIAG_STAT_X008); + if (response) { + memset(response, 0, rlen); + response_len = rlen; + rc = diag8_response(cmdlen, response, &rlen); + EBCASC(response, response_len); + } else { + rc = diag8_noresponse(cmdlen); + } + if (response_code) + *response_code = rc; + return rlen; +} +EXPORT_SYMBOL(__cpcmd); + +int cpcmd(const char *cmd, char *response, int rlen, int *response_code) +{ + unsigned long flags; + char *lowbuf; + int len; + + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); + if (!lowbuf) { + pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); + return -ENOMEM; + } + spin_lock_irqsave(&cpcmd_lock, flags); + len = __cpcmd(cmd, lowbuf, rlen, response_code); + spin_unlock_irqrestore(&cpcmd_lock, flags); + memcpy(response, lowbuf, rlen); + kfree(lowbuf); + } else { + spin_lock_irqsave(&cpcmd_lock, flags); + len = __cpcmd(cmd, response, rlen, response_code); + spin_unlock_irqrestore(&cpcmd_lock, flags); + } + return len; +} +EXPORT_SYMBOL(cpcmd); diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 0000000000..1b2ae42a0c --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2022 + */ + +#include +#include +#include + +enum { + TYPE_HWCAP, + TYPE_FACILITY, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c new file mode 100644 index 0000000000..7af69948b2 --- /dev/null +++ b/arch/s390/kernel/crash_dump.c @@ -0,0 +1,652 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 kdump implementation + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) +#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) +#define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y)))) + +static struct memblock_region oldmem_region; + +static struct memblock_type oldmem_type = { + .cnt = 1, + .max = 1, + .total_size = 0, + .regions = &oldmem_region, + .name = "oldmem", +}; + +struct save_area { + struct list_head list; + u64 psw[2]; + u64 ctrs[16]; + u64 gprs[16]; + u32 acrs[16]; + u64 fprs[16]; + u32 fpc; + u32 prefix; + u32 todpreg; + u64 timer; + u64 todcmp; + u64 vxrs_low[16]; + __vector128 vxrs_high[16]; +}; + +static LIST_HEAD(dump_save_areas); + +/* + * Allocate a save area + */ +struct save_area * __init save_area_alloc(bool is_boot_cpu) +{ + struct save_area *sa; + + sa = memblock_alloc(sizeof(*sa), 8); + if (!sa) + return NULL; + + if (is_boot_cpu) + list_add(&sa->list, &dump_save_areas); + else + list_add_tail(&sa->list, &dump_save_areas); + return sa; +} + +/* + * Return the address of the save area for the boot CPU + */ +struct save_area * __init save_area_boot_cpu(void) +{ + return list_first_entry_or_null(&dump_save_areas, struct save_area, list); +} + +/* + * Copy CPU registers into the save area + */ +void __init save_area_add_regs(struct save_area *sa, void *regs) +{ + struct lowcore *lc; + + lc = (struct lowcore *)(regs - __LC_FPREGS_SAVE_AREA); + memcpy(&sa->psw, &lc->psw_save_area, sizeof(sa->psw)); + memcpy(&sa->ctrs, &lc->cregs_save_area, sizeof(sa->ctrs)); + memcpy(&sa->gprs, &lc->gpregs_save_area, sizeof(sa->gprs)); + memcpy(&sa->acrs, &lc->access_regs_save_area, sizeof(sa->acrs)); + memcpy(&sa->fprs, &lc->floating_pt_save_area, sizeof(sa->fprs)); + memcpy(&sa->fpc, &lc->fpt_creg_save_area, sizeof(sa->fpc)); + memcpy(&sa->prefix, &lc->prefixreg_save_area, sizeof(sa->prefix)); + memcpy(&sa->todpreg, &lc->tod_progreg_save_area, sizeof(sa->todpreg)); + memcpy(&sa->timer, &lc->cpu_timer_save_area, sizeof(sa->timer)); + memcpy(&sa->todcmp, &lc->clock_comp_save_area, sizeof(sa->todcmp)); +} + +/* + * Copy vector registers into the save area + */ +void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) +{ + int i; + + /* Copy lower halves of vector registers 0-15 */ + for (i = 0; i < 16; i++) + sa->vxrs_low[i] = vxrs[i].low; + /* Copy vector registers 16-31 */ + memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); +} + +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) +{ + size_t len, copied, res = 0; + + while (count) { + if (!oldmem_data.start && src < sclp.hsa_size) { + /* Copy from zfcp/nvme dump HSA area */ + len = min(count, sclp.hsa_size - src); + copied = memcpy_hsa_iter(iter, src, len); + } else { + /* Check for swapped kdump oldmem areas */ + if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { + src -= oldmem_data.start; + len = min(count, oldmem_data.size - src); + } else if (oldmem_data.start && src < oldmem_data.size) { + len = min(count, oldmem_data.size - src); + src += oldmem_data.start; + } else { + len = count; + } + copied = memcpy_real_iter(iter, src, len); + } + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; + } + return res; +} + +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) +{ + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; + return 0; +} + +/* + * Copy one page from "oldmem" + */ +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) +{ + unsigned long src; + + src = pfn_to_phys(pfn) + offset; + return copy_oldmem_iter(iter, src, csize); +} + +/* + * Remap "oldmem" for kdump + * + * For the kdump reserved memory this functions performs a swap operation: + * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_old; + int rc; + + if (pfn < oldmem_data.size >> PAGE_SHIFT) { + size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT)); + rc = remap_pfn_range(vma, from, + pfn + (oldmem_data.start >> PAGE_SHIFT), + size_old, prot); + if (rc || size == size_old) + return rc; + size -= size_old; + from += size_old; + pfn += size_old >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for zfcp/nvme dump + * + * We only map available memory above HSA size. Memory below HSA size + * is read on demand using the copy_oldmem_page() function. + */ +static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long hsa_end = sclp.hsa_size; + unsigned long size_hsa; + + if (pfn < hsa_end >> PAGE_SHIFT) { + size_hsa = min(size, hsa_end - (pfn << PAGE_SHIFT)); + if (size == size_hsa) + return 0; + size -= size_hsa; + from += size_hsa; + pfn += size_hsa >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for kdump or zfcp/nvme dump + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (oldmem_data.start) + return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, + prot); +} + +static const char *nt_name(Elf64_Word type) +{ + const char *name = "LINUX"; + + if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) + name = KEXEC_CORE_NOTE_NAME; + return name; +} + +/* + * Initialize ELF note + */ +static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, + const char *name) +{ + Elf64_Nhdr *note; + u64 len; + + note = (Elf64_Nhdr *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = d_len; + note->n_type = type; + len = sizeof(Elf64_Nhdr); + + memcpy(buf + len, name, note->n_namesz); + len = roundup(len + note->n_namesz, 4); + + memcpy(buf + len, desc, note->n_descsz); + len = roundup(len + note->n_descsz, 4); + + return PTR_ADD(buf, len); +} + +static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) +{ + return nt_init_name(buf, type, desc, d_len, nt_name(type)); +} + +/* + * Calculate the size of ELF note + */ +static size_t nt_size_name(int d_len, const char *name) +{ + size_t size; + + size = sizeof(Elf64_Nhdr); + size += roundup(strlen(name) + 1, 4); + size += roundup(d_len, 4); + + return size; +} + +static inline size_t nt_size(Elf64_Word type, int d_len) +{ + return nt_size_name(d_len, nt_name(type)); +} + +/* + * Fill ELF notes for one CPU with save area registers + */ +static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) +{ + struct elf_prstatus nt_prstatus; + elf_fpregset_t nt_fpregset; + + /* Prepare prstatus note */ + memset(&nt_prstatus, 0, sizeof(nt_prstatus)); + memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); + memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); + memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); + nt_prstatus.common.pr_pid = cpu; + /* Prepare fpregset (floating point) note */ + memset(&nt_fpregset, 0, sizeof(nt_fpregset)); + memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); + memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); + /* Create ELF notes for the CPU */ + ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); + ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); + ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); + ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); + ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); + ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); + ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); + if (MACHINE_HAS_VX) { + ptr = nt_init(ptr, NT_S390_VXRS_HIGH, + &sa->vxrs_high, sizeof(sa->vxrs_high)); + ptr = nt_init(ptr, NT_S390_VXRS_LOW, + &sa->vxrs_low, sizeof(sa->vxrs_low)); + } + return ptr; +} + +/* + * Calculate size of ELF notes per cpu + */ +static size_t get_cpu_elf_notes_size(void) +{ + struct save_area *sa = NULL; + size_t size; + + size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); + size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); + size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); + size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); + size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); + size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); + size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); + if (MACHINE_HAS_VX) { + size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); + size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + } + + return size; +} + +/* + * Initialize prpsinfo note (new kernel) + */ +static void *nt_prpsinfo(void *ptr) +{ + struct elf_prpsinfo prpsinfo; + + memset(&prpsinfo, 0, sizeof(prpsinfo)); + prpsinfo.pr_sname = 'R'; + strcpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); +} + +/* + * Get vmcoreinfo using lowcore->vmcore_info (new kernel) + */ +static void *get_vmcoreinfo_old(unsigned long *size) +{ + char nt_name[11], *vmcoreinfo; + unsigned long addr; + Elf64_Nhdr note; + + if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr))) + return NULL; + memset(nt_name, 0, sizeof(nt_name)); + if (copy_oldmem_kernel(¬e, addr, sizeof(note))) + return NULL; + if (copy_oldmem_kernel(nt_name, addr + sizeof(note), + sizeof(nt_name) - 1)) + return NULL; + if (strcmp(nt_name, VMCOREINFO_NOTE_NAME) != 0) + return NULL; + vmcoreinfo = kzalloc(note.n_descsz, GFP_KERNEL); + if (!vmcoreinfo) + return NULL; + if (copy_oldmem_kernel(vmcoreinfo, addr + 24, note.n_descsz)) { + kfree(vmcoreinfo); + return NULL; + } + *size = note.n_descsz; + return vmcoreinfo; +} + +/* + * Initialize vmcoreinfo note (new kernel) + */ +static void *nt_vmcoreinfo(void *ptr) +{ + const char *name = VMCOREINFO_NOTE_NAME; + unsigned long size; + void *vmcoreinfo; + + vmcoreinfo = os_info_old_entry(OS_INFO_VMCOREINFO, &size); + if (vmcoreinfo) + return nt_init_name(ptr, 0, vmcoreinfo, size, name); + + vmcoreinfo = get_vmcoreinfo_old(&size); + if (!vmcoreinfo) + return ptr; + ptr = nt_init_name(ptr, 0, vmcoreinfo, size, name); + kfree(vmcoreinfo); + return ptr; +} + +static size_t nt_vmcoreinfo_size(void) +{ + const char *name = VMCOREINFO_NOTE_NAME; + unsigned long size; + void *vmcoreinfo; + + vmcoreinfo = os_info_old_entry(OS_INFO_VMCOREINFO, &size); + if (vmcoreinfo) + return nt_size_name(size, name); + + vmcoreinfo = get_vmcoreinfo_old(&size); + if (!vmcoreinfo) + return 0; + + kfree(vmcoreinfo); + return nt_size_name(size, name); +} + +/* + * Initialize final note (needed for /proc/vmcore code) + */ +static void *nt_final(void *ptr) +{ + Elf64_Nhdr *note; + + note = (Elf64_Nhdr *) ptr; + note->n_namesz = 0; + note->n_descsz = 0; + note->n_type = 0; + return PTR_ADD(ptr, sizeof(Elf64_Nhdr)); +} + +/* + * Initialize ELF header (new kernel) + */ +static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +{ + memset(ehdr, 0, sizeof(*ehdr)); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2MSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = EM_S390; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + ehdr->e_phnum = mem_chunk_cnt + 1; + return ehdr + 1; +} + +/* + * Return CPU count for ELF header (new kernel) + */ +static int get_cpu_cnt(void) +{ + struct save_area *sa; + int cpus = 0; + + list_for_each_entry(sa, &dump_save_areas, list) + if (sa->prefix != 0) + cpus++; + return cpus; +} + +/* + * Return memory chunk count for ELF header (new kernel) + */ +static int get_mem_chunk_cnt(void) +{ + int cnt = 0; + u64 idx; + + for_each_physmem_range(idx, &oldmem_type, NULL, NULL) + cnt++; + return cnt; +} + +/* + * Initialize ELF loads (new kernel) + */ +static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +{ + phys_addr_t start, end; + u64 idx; + + for_each_physmem_range(idx, &oldmem_type, &start, &end) { + phdr->p_filesz = end - start; + phdr->p_type = PT_LOAD; + phdr->p_offset = start; + phdr->p_vaddr = start; + phdr->p_paddr = start; + phdr->p_memsz = end - start; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; + phdr++; + } +} + +/* + * Initialize notes (new kernel) + */ +static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) +{ + struct save_area *sa; + void *ptr_start = ptr; + int cpu; + + ptr = nt_prpsinfo(ptr); + + cpu = 1; + list_for_each_entry(sa, &dump_save_areas, list) + if (sa->prefix != 0) + ptr = fill_cpu_elf_notes(ptr, cpu++, sa); + ptr = nt_vmcoreinfo(ptr); + ptr = nt_final(ptr); + memset(phdr, 0, sizeof(*phdr)); + phdr->p_type = PT_NOTE; + phdr->p_offset = notes_offset; + phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); + phdr->p_memsz = phdr->p_filesz; + return ptr; +} + +static size_t get_elfcorehdr_size(int mem_chunk_cnt) +{ + size_t size; + + size = sizeof(Elf64_Ehdr); + /* PT_NOTES */ + size += sizeof(Elf64_Phdr); + /* nt_prpsinfo */ + size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + /* regsets */ + size += get_cpu_cnt() * get_cpu_elf_notes_size(); + /* nt_vmcoreinfo */ + size += nt_vmcoreinfo_size(); + /* nt_final */ + size += sizeof(Elf64_Nhdr); + /* PT_LOADS */ + size += mem_chunk_cnt * sizeof(Elf64_Phdr); + + return size; +} + +/* + * Create ELF core header (new kernel) + */ +int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + Elf64_Phdr *phdr_notes, *phdr_loads; + size_t alloc_size; + int mem_chunk_cnt; + void *ptr, *hdr; + u64 hdr_off; + + /* If we are not in kdump or zfcp/nvme dump mode return */ + if (!oldmem_data.start && !is_ipl_type_dump()) + return 0; + /* If we cannot get HSA size for zfcp/nvme dump return error */ + if (is_ipl_type_dump() && !sclp.hsa_size) + return -ENODEV; + + /* For kdump, exclude previous crashkernel memory */ + if (oldmem_data.start) { + oldmem_region.base = oldmem_data.start; + oldmem_region.size = oldmem_data.size; + oldmem_type.total_size = oldmem_data.size; + } + + mem_chunk_cnt = get_mem_chunk_cnt(); + + alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + + hdr = kzalloc(alloc_size, GFP_KERNEL); + + /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + * a dump with this crash kernel will fail. Panic now to allow other + * dump mechanisms to take over. + */ + if (!hdr) + panic("s390 kdump allocating elfcorehdr failed"); + + /* Init elf header */ + ptr = ehdr_init(hdr, mem_chunk_cnt); + /* Init program headers */ + phdr_notes = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); + phdr_loads = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + /* Init notes */ + hdr_off = PTR_DIFF(ptr, hdr); + ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init loads */ + hdr_off = PTR_DIFF(ptr, hdr); + loads_init(phdr_loads, hdr_off); + *addr = (unsigned long long) hdr; + *size = (unsigned long long) hdr_off; + BUG_ON(elfcorehdr_size > alloc_size); + return 0; +} + +/* + * Free ELF core header (new kernel) + */ +void elfcorehdr_free(unsigned long long addr) +{ + kfree((void *)(unsigned long)addr); +} + +/* + * Read from ELF header + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + memcpy(buf, src, count); + *ppos += count; + return count; +} + +/* + * Read from ELF notes data + */ +ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + memcpy(buf, src, count); + *ppos += count; + return count; +} diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c new file mode 100644 index 0000000000..a85e0c3e70 --- /dev/null +++ b/arch/s390/kernel/debug.c @@ -0,0 +1,1574 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S/390 debug facility + * + * Copyright IBM Corp. 1999, 2020 + * + * Author(s): Michael Holzheu (holzheu@de.ibm.com), + * Holger Smolinski (Holger.Smolinski@de.ibm.com) + * + * Bugreports to: + */ + +#define KMSG_COMPONENT "s390dbf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DEBUG_PROLOG_ENTRY -1 + +#define ALL_AREAS 0 /* copy all debug areas */ +#define NO_AREAS 1 /* copy no debug areas */ + +/* typedefs */ + +typedef struct file_private_info { + loff_t offset; /* offset of last read in file */ + int act_area; /* number of last formated area */ + int act_page; /* act page in given area */ + int act_entry; /* last formated entry (offset */ + /* relative to beginning of last */ + /* formated page) */ + size_t act_entry_offset; /* up to this offset we copied */ + /* in last read the last formated */ + /* entry to userland */ + char temp_buf[2048]; /* buffer for output */ + debug_info_t *debug_info_org; /* original debug information */ + debug_info_t *debug_info_snap; /* snapshot of debug information */ + struct debug_view *view; /* used view of debug info */ +} file_private_info_t; + +typedef struct { + char *string; + /* + * This assumes that all args are converted into longs + * on L/390 this is the case for all types of parameter + * except of floats, and long long (32 bit) + * + */ + long args[]; +} debug_sprintf_entry_t; + +/* internal function prototyes */ + +static int debug_init(void); +static ssize_t debug_output(struct file *file, char __user *user_buf, + size_t user_len, loff_t *offset); +static ssize_t debug_input(struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset); +static int debug_open(struct inode *inode, struct file *file); +static int debug_close(struct inode *inode, struct file *file); +static debug_info_t *debug_info_create(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode); +static void debug_info_get(debug_info_t *); +static void debug_info_put(debug_info_t *); +static int debug_prolog_level_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_prolog_pages_fn(debug_info_t *id, + struct debug_view *view, char *out_buf); +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_buf_size, loff_t *offset); +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf); +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *inbuf); +static void debug_areas_swap(debug_info_t *a, debug_info_t *b); +static void debug_events_append(debug_info_t *dest, debug_info_t *src); + +/* globals */ + +struct debug_view debug_hex_ascii_view = { + "hex_ascii", + NULL, + &debug_dflt_header_fn, + &debug_hex_ascii_format_fn, + NULL, + NULL +}; +EXPORT_SYMBOL(debug_hex_ascii_view); + +static struct debug_view debug_level_view = { + "level", + &debug_prolog_level_fn, + NULL, + NULL, + &debug_input_level_fn, + NULL +}; + +static struct debug_view debug_pages_view = { + "pages", + &debug_prolog_pages_fn, + NULL, + NULL, + &debug_input_pages_fn, + NULL +}; + +static struct debug_view debug_flush_view = { + "flush", + NULL, + NULL, + NULL, + &debug_input_flush_fn, + NULL +}; + +struct debug_view debug_sprintf_view = { + "sprintf", + NULL, + &debug_dflt_header_fn, + &debug_sprintf_format_fn, + NULL, + NULL +}; +EXPORT_SYMBOL(debug_sprintf_view); + +/* used by dump analysis tools to determine version of debug feature */ +static unsigned int __used debug_feature_version = __DEBUG_FEATURE_VERSION; + +/* static globals */ + +static debug_info_t *debug_area_first; +static debug_info_t *debug_area_last; +static DEFINE_MUTEX(debug_mutex); + +static int initialized; +static int debug_critical; + +static const struct file_operations debug_file_ops = { + .owner = THIS_MODULE, + .read = debug_output, + .write = debug_input, + .open = debug_open, + .release = debug_close, + .llseek = no_llseek, +}; + +static struct dentry *debug_debugfs_root_entry; + +/* functions */ + +/* + * debug_areas_alloc + * - Debug areas are implemented as a threedimensonal array: + * areas[areanumber][pagenumber][pageoffset] + */ + +static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas) +{ + debug_entry_t ***areas; + int i, j; + + areas = kmalloc_array(nr_areas, sizeof(debug_entry_t **), GFP_KERNEL); + if (!areas) + goto fail_malloc_areas; + for (i = 0; i < nr_areas; i++) { + /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */ + areas[i] = kmalloc_array(pages_per_area, + sizeof(debug_entry_t *), + GFP_KERNEL | __GFP_NOWARN); + if (!areas[i]) + goto fail_malloc_areas2; + for (j = 0; j < pages_per_area; j++) { + areas[i][j] = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!areas[i][j]) { + for (j--; j >= 0 ; j--) + kfree(areas[i][j]); + kfree(areas[i]); + goto fail_malloc_areas2; + } + } + } + return areas; + +fail_malloc_areas2: + for (i--; i >= 0; i--) { + for (j = 0; j < pages_per_area; j++) + kfree(areas[i][j]); + kfree(areas[i]); + } + kfree(areas); +fail_malloc_areas: + return NULL; +} + +/* + * debug_info_alloc + * - alloc new debug-info + */ +static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, + int nr_areas, int buf_size, int level, + int mode) +{ + debug_info_t *rc; + + /* alloc everything */ + rc = kmalloc(sizeof(debug_info_t), GFP_KERNEL); + if (!rc) + goto fail_malloc_rc; + rc->active_entries = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + if (!rc->active_entries) + goto fail_malloc_active_entries; + rc->active_pages = kcalloc(nr_areas, sizeof(int), GFP_KERNEL); + if (!rc->active_pages) + goto fail_malloc_active_pages; + if ((mode == ALL_AREAS) && (pages_per_area != 0)) { + rc->areas = debug_areas_alloc(pages_per_area, nr_areas); + if (!rc->areas) + goto fail_malloc_areas; + } else { + rc->areas = NULL; + } + + /* initialize members */ + spin_lock_init(&rc->lock); + rc->pages_per_area = pages_per_area; + rc->nr_areas = nr_areas; + rc->active_area = 0; + rc->level = level; + rc->buf_size = buf_size; + rc->entry_size = sizeof(debug_entry_t) + buf_size; + strscpy(rc->name, name, sizeof(rc->name)); + memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); + memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); + refcount_set(&(rc->ref_count), 0); + + return rc; + +fail_malloc_areas: + kfree(rc->active_pages); +fail_malloc_active_pages: + kfree(rc->active_entries); +fail_malloc_active_entries: + kfree(rc); +fail_malloc_rc: + return NULL; +} + +/* + * debug_areas_free + * - free all debug areas + */ +static void debug_areas_free(debug_info_t *db_info) +{ + int i, j; + + if (!db_info->areas) + return; + for (i = 0; i < db_info->nr_areas; i++) { + for (j = 0; j < db_info->pages_per_area; j++) + kfree(db_info->areas[i][j]); + kfree(db_info->areas[i]); + } + kfree(db_info->areas); + db_info->areas = NULL; +} + +/* + * debug_info_free + * - free memory debug-info + */ +static void debug_info_free(debug_info_t *db_info) +{ + debug_areas_free(db_info); + kfree(db_info->active_entries); + kfree(db_info->active_pages); + kfree(db_info); +} + +/* + * debug_info_create + * - create new debug-info + */ + +static debug_info_t *debug_info_create(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode) +{ + debug_info_t *rc; + + rc = debug_info_alloc(name, pages_per_area, nr_areas, buf_size, + DEBUG_DEFAULT_LEVEL, ALL_AREAS); + if (!rc) + goto out; + + rc->mode = mode & ~S_IFMT; + refcount_set(&rc->ref_count, 1); +out: + return rc; +} + +/* + * debug_info_copy + * - copy debug-info + */ +static debug_info_t *debug_info_copy(debug_info_t *in, int mode) +{ + unsigned long flags; + debug_info_t *rc; + int i, j; + + /* get a consistent copy of the debug areas */ + do { + rc = debug_info_alloc(in->name, in->pages_per_area, + in->nr_areas, in->buf_size, in->level, mode); + spin_lock_irqsave(&in->lock, flags); + if (!rc) + goto out; + /* has something changed in the meantime ? */ + if ((rc->pages_per_area == in->pages_per_area) && + (rc->nr_areas == in->nr_areas)) { + break; + } + spin_unlock_irqrestore(&in->lock, flags); + debug_info_free(rc); + } while (1); + + if (mode == NO_AREAS) + goto out; + + for (i = 0; i < in->nr_areas; i++) { + for (j = 0; j < in->pages_per_area; j++) + memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE); + } +out: + spin_unlock_irqrestore(&in->lock, flags); + return rc; +} + +/* + * debug_info_get + * - increments reference count for debug-info + */ +static void debug_info_get(debug_info_t *db_info) +{ + if (db_info) + refcount_inc(&db_info->ref_count); +} + +/* + * debug_info_put: + * - decreases reference count for debug-info and frees it if necessary + */ +static void debug_info_put(debug_info_t *db_info) +{ + if (!db_info) + return; + if (refcount_dec_and_test(&db_info->ref_count)) + debug_info_free(db_info); +} + +/* + * debug_format_entry: + * - format one debug entry and return size of formated data + */ +static int debug_format_entry(file_private_info_t *p_info) +{ + debug_info_t *id_snap = p_info->debug_info_snap; + struct debug_view *view = p_info->view; + debug_entry_t *act_entry; + size_t len = 0; + + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { + /* print prolog */ + if (view->prolog_proc) + len += view->prolog_proc(id_snap, view, p_info->temp_buf); + goto out; + } + if (!id_snap->areas) /* this is true, if we have a prolog only view */ + goto out; /* or if 'pages_per_area' is 0 */ + act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area] + [p_info->act_page] + p_info->act_entry); + + if (act_entry->clock == 0LL) + goto out; /* empty entry */ + if (view->header_proc) + len += view->header_proc(id_snap, view, p_info->act_area, + act_entry, p_info->temp_buf + len); + if (view->format_proc) + len += view->format_proc(id_snap, view, p_info->temp_buf + len, + DEBUG_DATA(act_entry)); +out: + return len; +} + +/* + * debug_next_entry: + * - goto next entry in p_info + */ +static inline int debug_next_entry(file_private_info_t *p_info) +{ + debug_info_t *id; + + id = p_info->debug_info_snap; + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { + p_info->act_entry = 0; + p_info->act_page = 0; + goto out; + } + if (!id->areas) + return 1; + p_info->act_entry += id->entry_size; + /* switch to next page, if we reached the end of the page */ + if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) { + /* next page */ + p_info->act_entry = 0; + p_info->act_page += 1; + if ((p_info->act_page % id->pages_per_area) == 0) { + /* next area */ + p_info->act_area++; + p_info->act_page = 0; + } + if (p_info->act_area >= id->nr_areas) + return 1; + } +out: + return 0; +} + +/* + * debug_output: + * - called for user read() + * - copies formated debug entries to the user buffer + */ +static ssize_t debug_output(struct file *file, /* file descriptor */ + char __user *user_buf, /* user buffer */ + size_t len, /* length of buffer */ + loff_t *offset) /* offset in the file */ +{ + size_t count = 0; + size_t entry_offset; + file_private_info_t *p_info; + + p_info = (file_private_info_t *) file->private_data; + if (*offset != p_info->offset) + return -EPIPE; + if (p_info->act_area >= p_info->debug_info_snap->nr_areas) + return 0; + entry_offset = p_info->act_entry_offset; + while (count < len) { + int formatted_line_residue; + int formatted_line_size; + int user_buf_residue; + size_t copy_size; + + formatted_line_size = debug_format_entry(p_info); + formatted_line_residue = formatted_line_size - entry_offset; + user_buf_residue = len-count; + copy_size = min(user_buf_residue, formatted_line_residue); + if (copy_size) { + if (copy_to_user(user_buf + count, p_info->temp_buf + + entry_offset, copy_size)) + return -EFAULT; + count += copy_size; + entry_offset += copy_size; + } + if (copy_size == formatted_line_residue) { + entry_offset = 0; + if (debug_next_entry(p_info)) + goto out; + } + } +out: + p_info->offset = *offset + count; + p_info->act_entry_offset = entry_offset; + *offset = p_info->offset; + return count; +} + +/* + * debug_input: + * - called for user write() + * - calls input function of view + */ +static ssize_t debug_input(struct file *file, const char __user *user_buf, + size_t length, loff_t *offset) +{ + file_private_info_t *p_info; + int rc = 0; + + mutex_lock(&debug_mutex); + p_info = ((file_private_info_t *) file->private_data); + if (p_info->view->input_proc) { + rc = p_info->view->input_proc(p_info->debug_info_org, + p_info->view, file, user_buf, + length, offset); + } else { + rc = -EPERM; + } + mutex_unlock(&debug_mutex); + return rc; /* number of input characters */ +} + +/* + * debug_open: + * - called for user open() + * - copies formated output to private_data area of the file + * handle + */ +static int debug_open(struct inode *inode, struct file *file) +{ + debug_info_t *debug_info, *debug_info_snapshot; + file_private_info_t *p_info; + int i, rc = 0; + + mutex_lock(&debug_mutex); + debug_info = file_inode(file)->i_private; + /* find debug view */ + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!debug_info->views[i]) + continue; + else if (debug_info->debugfs_entries[i] == file->f_path.dentry) + goto found; /* found view ! */ + } + /* no entry found */ + rc = -EINVAL; + goto out; + +found: + + /* Make snapshot of current debug areas to get it consistent. */ + /* To copy all the areas is only needed, if we have a view which */ + /* formats the debug areas. */ + + if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc) + debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); + else + debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); + + if (!debug_info_snapshot) { + rc = -ENOMEM; + goto out; + } + p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + if (!p_info) { + debug_info_free(debug_info_snapshot); + rc = -ENOMEM; + goto out; + } + p_info->offset = 0; + p_info->debug_info_snap = debug_info_snapshot; + p_info->debug_info_org = debug_info; + p_info->view = debug_info->views[i]; + p_info->act_area = 0; + p_info->act_page = 0; + p_info->act_entry = DEBUG_PROLOG_ENTRY; + p_info->act_entry_offset = 0; + file->private_data = p_info; + debug_info_get(debug_info); + nonseekable_open(inode, file); +out: + mutex_unlock(&debug_mutex); + return rc; +} + +/* + * debug_close: + * - called for user close() + * - deletes private_data area of the file handle + */ +static int debug_close(struct inode *inode, struct file *file) +{ + file_private_info_t *p_info; + + p_info = (file_private_info_t *) file->private_data; + if (p_info->debug_info_snap) + debug_info_free(p_info->debug_info_snap); + debug_info_put(p_info->debug_info_org); + kfree(file->private_data); + return 0; /* success */ +} + +/* Create debugfs entries and add to internal list. */ +static void _debug_register(debug_info_t *id) +{ + /* create root directory */ + id->debugfs_root_entry = debugfs_create_dir(id->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = id; + id->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = id; + id->prev = debug_area_last; + } + debug_area_last = id; + id->next = NULL; + + debug_register_view(id, &debug_level_view); + debug_register_view(id, &debug_flush_view); + debug_register_view(id, &debug_pages_view); +} + +/** + * debug_register_mode() - creates and initializes debug area. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @mode: File mode for debugfs files. E.g. S_IRWXUGO + * @uid: User ID for debugfs files. Currently only 0 is supported. + * @gid: Group ID for debugfs files. Currently only 0 is supported. + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * Must not be called within an interrupt handler. + */ +debug_info_t *debug_register_mode(const char *name, int pages_per_area, + int nr_areas, int buf_size, umode_t mode, + uid_t uid, gid_t gid) +{ + debug_info_t *rc = NULL; + + /* Since debugfs currently does not support uid/gid other than root, */ + /* we do not allow gid/uid != 0 until we get support for that. */ + if ((uid != 0) || (gid != 0)) + pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); + BUG_ON(!initialized); + + /* create new debug_info */ + rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); + if (rc) { + mutex_lock(&debug_mutex); + _debug_register(rc); + mutex_unlock(&debug_mutex); + } else { + pr_err("Registering debug feature %s failed\n", name); + } + return rc; +} +EXPORT_SYMBOL(debug_register_mode); + +/** + * debug_register() - creates and initializes debug area with default file mode. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * The debugfs file mode access permissions are read and write for user. + * Must not be called within an interrupt handler. + */ +debug_info_t *debug_register(const char *name, int pages_per_area, + int nr_areas, int buf_size) +{ + return debug_register_mode(name, pages_per_area, nr_areas, buf_size, + S_IRUSR | S_IWUSR, 0, 0); +} +EXPORT_SYMBOL(debug_register); + +/** + * debug_register_static() - registers a static debug area + * + * @id: Handle for static debug area + * @pages_per_area: Number of pages per area + * @nr_areas: Number of debug areas + * + * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO. + * + * Note: This function is called automatically via an initcall generated by + * DEFINE_STATIC_DEBUG_INFO. + */ +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas) +{ + unsigned long flags; + debug_info_t *copy; + + if (!initialized) { + pr_err("Tried to register debug feature %s too early\n", + id->name); + return; + } + + copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!copy) { + pr_err("Registering debug feature %s failed\n", id->name); + + /* Clear pointers to prevent tracing into released initdata. */ + spin_lock_irqsave(&id->lock, flags); + id->areas = NULL; + id->active_pages = NULL; + id->active_entries = NULL; + spin_unlock_irqrestore(&id->lock, flags); + + return; + } + + /* Replace static trace area with dynamic copy. */ + spin_lock_irqsave(&id->lock, flags); + debug_events_append(copy, id); + debug_areas_swap(id, copy); + spin_unlock_irqrestore(&id->lock, flags); + + /* Clear pointers to initdata and discard copy. */ + copy->areas = NULL; + copy->active_pages = NULL; + copy->active_entries = NULL; + debug_info_free(copy); + + mutex_lock(&debug_mutex); + _debug_register(id); + mutex_unlock(&debug_mutex); +} + +/* Remove debugfs entries and remove from internal list. */ +static void _debug_unregister(debug_info_t *id) +{ + int i; + + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + continue; + debugfs_remove(id->debugfs_entries[i]); + } + debugfs_remove(id->debugfs_root_entry); + if (id == debug_area_first) + debug_area_first = id->next; + if (id == debug_area_last) + debug_area_last = id->prev; + if (id->prev) + id->prev->next = id->next; + if (id->next) + id->next->prev = id->prev; +} + +/** + * debug_unregister() - give back debug area. + * + * @id: handle for debug log + * + * Return: + * none + */ +void debug_unregister(debug_info_t *id) +{ + if (!id) + return; + mutex_lock(&debug_mutex); + _debug_unregister(id); + mutex_unlock(&debug_mutex); + + debug_info_put(id); +} +EXPORT_SYMBOL(debug_unregister); + +/* + * debug_set_size: + * - set area size (number of pages) and number of areas + */ +static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) +{ + debug_info_t *new_id; + unsigned long flags; + + if (!id || (nr_areas <= 0) || (pages_per_area < 0)) + return -EINVAL; + + new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!new_id) { + pr_info("Allocating memory for %i pages failed\n", + pages_per_area); + return -ENOMEM; + } + + spin_lock_irqsave(&id->lock, flags); + debug_events_append(new_id, id); + debug_areas_swap(new_id, id); + debug_info_free(new_id); + spin_unlock_irqrestore(&id->lock, flags); + pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); + + return 0; +} + +/** + * debug_set_level() - Sets new actual debug level if new_level is valid. + * + * @id: handle for debug log + * @new_level: new debug level + * + * Return: + * none + */ +void debug_set_level(debug_info_t *id, int new_level) +{ + unsigned long flags; + + if (!id) + return; + + if (new_level == DEBUG_OFF_LEVEL) { + pr_info("%s: switched off\n", id->name); + } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { + pr_info("%s: level %i is out of range (%i - %i)\n", + id->name, new_level, 0, DEBUG_MAX_LEVEL); + return; + } + + spin_lock_irqsave(&id->lock, flags); + id->level = new_level; + spin_unlock_irqrestore(&id->lock, flags); +} +EXPORT_SYMBOL(debug_set_level); + +/* + * proceed_active_entry: + * - set active entry to next in the ring buffer + */ +static inline void proceed_active_entry(debug_info_t *id) +{ + if ((id->active_entries[id->active_area] += id->entry_size) + > (PAGE_SIZE - id->entry_size)) { + id->active_entries[id->active_area] = 0; + id->active_pages[id->active_area] = + (id->active_pages[id->active_area] + 1) % + id->pages_per_area; + } +} + +/* + * proceed_active_area: + * - set active area to next in the ring buffer + */ +static inline void proceed_active_area(debug_info_t *id) +{ + id->active_area++; + id->active_area = id->active_area % id->nr_areas; +} + +/* + * get_active_entry: + */ +static inline debug_entry_t *get_active_entry(debug_info_t *id) +{ + return (debug_entry_t *) (((char *) id->areas[id->active_area] + [id->active_pages[id->active_area]]) + + id->active_entries[id->active_area]); +} + +/* Swap debug areas of a and b. */ +static void debug_areas_swap(debug_info_t *a, debug_info_t *b) +{ + swap(a->nr_areas, b->nr_areas); + swap(a->pages_per_area, b->pages_per_area); + swap(a->areas, b->areas); + swap(a->active_area, b->active_area); + swap(a->active_pages, b->active_pages); + swap(a->active_entries, b->active_entries); +} + +/* Append all debug events in active area from source to destination log. */ +static void debug_events_append(debug_info_t *dest, debug_info_t *src) +{ + debug_entry_t *from, *to, *last; + + if (!src->areas || !dest->areas) + return; + + /* Loop over all entries in src, starting with oldest. */ + from = get_active_entry(src); + last = from; + do { + if (from->clock != 0LL) { + to = get_active_entry(dest); + memset(to, 0, dest->entry_size); + memcpy(to, from, min(src->entry_size, + dest->entry_size)); + proceed_active_entry(dest); + } + + proceed_active_entry(src); + from = get_active_entry(src); + } while (from != last); +} + +/* + * debug_finish_entry: + * - set timestamp, caller address, cpu number etc. + */ + +static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active, + int level, int exception) +{ + unsigned long timestamp; + union tod_clock clk; + + store_tod_clock_ext(&clk); + timestamp = clk.us; + timestamp -= TOD_UNIX_EPOCH >> 12; + active->clock = timestamp; + active->cpu = smp_processor_id(); + active->caller = __builtin_return_address(0); + active->exception = exception; + active->level = level; + proceed_active_entry(id); + if (exception) + proceed_active_area(id); +} + +static int debug_stoppable = 1; +static int debug_active = 1; + +#define CTL_S390DBF_STOPPABLE 5678 +#define CTL_S390DBF_ACTIVE 5679 + +/* + * proc handler for the running debug_active sysctl + * always allow read, allow write only if debug_stoppable is set or + * if debug_active is already off + */ +static int s390dbf_procactive(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (!write || debug_stoppable || !debug_active) + return proc_dointvec(table, write, buffer, lenp, ppos); + else + return 0; +} + +static struct ctl_table s390dbf_table[] = { + { + .procname = "debug_stoppable", + .data = &debug_stoppable, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, + }, + { + .procname = "debug_active", + .data = &debug_active, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = s390dbf_procactive, + }, + { } +}; + +static struct ctl_table_header *s390dbf_sysctl_header; + +/** + * debug_stop_all() - stops the debug feature if stopping is allowed. + * + * Return: + * - none + * + * Currently used in case of a kernel oops. + */ +void debug_stop_all(void) +{ + if (debug_stoppable) + debug_active = 0; +} +EXPORT_SYMBOL(debug_stop_all); + +/** + * debug_set_critical() - event/exception functions try lock instead of spin. + * + * Return: + * - none + * + * Currently used in case of stopping all CPUs but the current one. + * Once in this state, functions to write a debug entry for an + * event or exception no longer spin on the debug area lock, + * but only try to get it and fail if they do not get the lock. + */ +void debug_set_critical(void) +{ + debug_critical = 1; +} + +/* + * debug_event_common: + * - write debug entry with given size + */ +debug_entry_t *debug_event_common(debug_info_t *id, int level, const void *buf, + int len) +{ + debug_entry_t *active; + unsigned long flags; + + if (!debug_active || !id->areas) + return NULL; + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + do { + active = get_active_entry(id); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); + debug_finish_entry(id, active, level, 0); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + + spin_unlock_irqrestore(&id->lock, flags); + return active; +} +EXPORT_SYMBOL(debug_event_common); + +/* + * debug_exception_common: + * - write debug entry with given size and switch to next debug area + */ +debug_entry_t *debug_exception_common(debug_info_t *id, int level, + const void *buf, int len) +{ + debug_entry_t *active; + unsigned long flags; + + if (!debug_active || !id->areas) + return NULL; + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + do { + active = get_active_entry(id); + memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); + if (len < id->buf_size) + memset((DEBUG_DATA(active)) + len, 0, id->buf_size - len); + debug_finish_entry(id, active, level, len <= id->buf_size); + len -= id->buf_size; + buf += id->buf_size; + } while (len > 0); + + spin_unlock_irqrestore(&id->lock, flags); + return active; +} +EXPORT_SYMBOL(debug_exception_common); + +/* + * counts arguments in format string for sprintf view + */ +static inline int debug_count_numargs(char *string) +{ + int numargs = 0; + + while (*string) { + if (*string++ == '%') + numargs++; + } + return numargs; +} + +/* + * debug_sprintf_event: + */ +debug_entry_t *__debug_sprintf_event(debug_info_t *id, int level, char *string, ...) +{ + debug_sprintf_entry_t *curr_event; + debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; + + if (!debug_active || !id->areas) + return NULL; + numargs = debug_count_numargs(string); + + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + active = get_active_entry(id); + curr_event = (debug_sprintf_entry_t *) DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); + va_end(ap); + debug_finish_entry(id, active, level, 0); + spin_unlock_irqrestore(&id->lock, flags); + + return active; +} +EXPORT_SYMBOL(__debug_sprintf_event); + +/* + * debug_sprintf_exception: + */ +debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) +{ + debug_sprintf_entry_t *curr_event; + debug_entry_t *active; + unsigned long flags; + int numargs, idx; + va_list ap; + + if (!debug_active || !id->areas) + return NULL; + + numargs = debug_count_numargs(string); + + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&id->lock, flags); + } + active = get_active_entry(id); + curr_event = (debug_sprintf_entry_t *)DEBUG_DATA(active); + va_start(ap, string); + curr_event->string = string; + for (idx = 0; idx < min(numargs, (int)(id->buf_size / sizeof(long)) - 1); idx++) + curr_event->args[idx] = va_arg(ap, long); + va_end(ap); + debug_finish_entry(id, active, level, 1); + spin_unlock_irqrestore(&id->lock, flags); + + return active; +} +EXPORT_SYMBOL(__debug_sprintf_exception); + +/** + * debug_register_view() - registers new debug view and creates debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error + */ +int debug_register_view(debug_info_t *id, struct debug_view *view) +{ + unsigned long flags; + struct dentry *pde; + umode_t mode; + int rc = 0; + int i; + + if (!id) + goto out; + mode = (id->mode | S_IFREG) & ~S_IXUGO; + if (!(view->prolog_proc || view->format_proc || view->header_proc)) + mode &= ~(S_IRUSR | S_IRGRP | S_IROTH); + if (!view->input_proc) + mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); + pde = debugfs_create_file(view->name, mode, id->debugfs_root_entry, + id, &debug_file_ops); + spin_lock_irqsave(&id->lock, flags); + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + break; + } + if (i == DEBUG_MAX_VIEWS) { + rc = -1; + } else { + id->views[i] = view; + id->debugfs_entries[i] = pde; + } + spin_unlock_irqrestore(&id->lock, flags); + if (rc) { + pr_err("Registering view %s/%s would exceed the maximum " + "number of views %i\n", id->name, view->name, i); + debugfs_remove(pde); + } +out: + return rc; +} +EXPORT_SYMBOL(debug_register_view); + +/** + * debug_unregister_view() - unregisters debug view and removes debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error + */ +int debug_unregister_view(debug_info_t *id, struct debug_view *view) +{ + struct dentry *dentry = NULL; + unsigned long flags; + int i, rc = 0; + + if (!id) + goto out; + spin_lock_irqsave(&id->lock, flags); + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (id->views[i] == view) + break; + } + if (i == DEBUG_MAX_VIEWS) { + rc = -1; + } else { + dentry = id->debugfs_entries[i]; + id->views[i] = NULL; + id->debugfs_entries[i] = NULL; + } + spin_unlock_irqrestore(&id->lock, flags); + debugfs_remove(dentry); +out: + return rc; +} +EXPORT_SYMBOL(debug_unregister_view); + +static inline char *debug_get_user_string(const char __user *user_buf, + size_t user_len) +{ + char *buffer; + + buffer = kmalloc(user_len + 1, GFP_KERNEL); + if (!buffer) + return ERR_PTR(-ENOMEM); + if (copy_from_user(buffer, user_buf, user_len) != 0) { + kfree(buffer); + return ERR_PTR(-EFAULT); + } + /* got the string, now strip linefeed. */ + if (buffer[user_len - 1] == '\n') + buffer[user_len - 1] = 0; + else + buffer[user_len] = 0; + return buffer; +} + +static inline int debug_get_uint(char *buf) +{ + int rc; + + buf = skip_spaces(buf); + rc = simple_strtoul(buf, &buf, 10); + if (*buf) + rc = -EINVAL; + return rc; +} + +/* + * functions for debug-views + *********************************** +*/ + +/* + * prints out actual debug level + */ + +static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) +{ + return sprintf(out_buf, "%i\n", id->pages_per_area); +} + +/* + * reads new size (number of pages per debug area) + */ + +static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + int rc, new_pages; + char *str; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { + rc = PTR_ERR(str); + goto out; + } + new_pages = debug_get_uint(str); + if (new_pages < 0) { + rc = -EINVAL; + goto free_str; + } + rc = debug_set_size(id, id->nr_areas, new_pages); + if (rc != 0) { + rc = -EINVAL; + goto free_str; + } + rc = user_len; +free_str: + kfree(str); +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * prints out actual debug level + */ +static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, + char *out_buf) +{ + int rc = 0; + + if (id->level == DEBUG_OFF_LEVEL) + rc = sprintf(out_buf, "-\n"); + else + rc = sprintf(out_buf, "%i\n", id->level); + return rc; +} + +/* + * reads new debug level + */ +static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + int rc, new_level; + char *str; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + str = debug_get_user_string(user_buf, user_len); + if (IS_ERR(str)) { + rc = PTR_ERR(str); + goto out; + } + if (str[0] == '-') { + debug_set_level(id, DEBUG_OFF_LEVEL); + rc = user_len; + goto free_str; + } else { + new_level = debug_get_uint(str); + } + if (new_level < 0) { + pr_warn("%s is not a valid level for a debug feature\n", str); + rc = -EINVAL; + } else { + debug_set_level(id, new_level); + rc = user_len; + } +free_str: + kfree(str); +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * flushes debug areas + */ +static void debug_flush(debug_info_t *id, int area) +{ + unsigned long flags; + int i, j; + + if (!id || !id->areas) + return; + spin_lock_irqsave(&id->lock, flags); + if (area == DEBUG_FLUSH_ALL) { + id->active_area = 0; + memset(id->active_entries, 0, id->nr_areas * sizeof(int)); + for (i = 0; i < id->nr_areas; i++) { + id->active_pages[i] = 0; + for (j = 0; j < id->pages_per_area; j++) + memset(id->areas[i][j], 0, PAGE_SIZE); + } + } else if (area >= 0 && area < id->nr_areas) { + id->active_entries[area] = 0; + id->active_pages[area] = 0; + for (i = 0; i < id->pages_per_area; i++) + memset(id->areas[area][i], 0, PAGE_SIZE); + } + spin_unlock_irqrestore(&id->lock, flags); +} + +/* + * view function: flushes debug areas + */ +static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, + struct file *file, const char __user *user_buf, + size_t user_len, loff_t *offset) +{ + char input_buf[1]; + int rc = user_len; + + if (user_len > 0x10000) + user_len = 0x10000; + if (*offset != 0) { + rc = -EPIPE; + goto out; + } + if (copy_from_user(input_buf, user_buf, 1)) { + rc = -EFAULT; + goto out; + } + if (input_buf[0] == '-') { + debug_flush(id, DEBUG_FLUSH_ALL); + goto out; + } + if (isdigit(input_buf[0])) { + int area = ((int) input_buf[0] - (int) '0'); + + debug_flush(id, area); + goto out; + } + + pr_info("Flushing debug data failed because %c is not a valid " + "area\n", input_buf[0]); + +out: + *offset += user_len; + return rc; /* number of input characters */ +} + +/* + * prints debug data in hex/ascii format + */ +static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf) +{ + int i, rc = 0; + + for (i = 0; i < id->buf_size; i++) + rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); + rc += sprintf(out_buf + rc, "| "); + for (i = 0; i < id->buf_size; i++) { + unsigned char c = in_buf[i]; + + if (isascii(c) && isprint(c)) + rc += sprintf(out_buf + rc, "%c", c); + else + rc += sprintf(out_buf + rc, "."); + } + rc += sprintf(out_buf + rc, "\n"); + return rc; +} + +/* + * prints header for debug entry + */ +int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, + int area, debug_entry_t *entry, char *out_buf) +{ + unsigned long sec, usec; + unsigned long caller; + unsigned int level; + char *except_str; + int rc = 0; + + level = entry->level; + sec = entry->clock; + usec = do_div(sec, USEC_PER_SEC); + + if (entry->exception) + except_str = "*"; + else + except_str = "-"; + caller = (unsigned long) entry->caller; + rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ", + area, sec, usec, level, except_str, + entry->cpu, (void *)caller); + return rc; +} +EXPORT_SYMBOL(debug_dflt_header_fn); + +/* + * prints debug data sprintf-formated: + * debug_sprinf_event/exception calls must be used together with this view + */ + +#define DEBUG_SPRINTF_MAX_ARGS 10 + +static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, const char *inbuf) +{ + debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; + int num_longs, num_used_args = 0, i, rc = 0; + int index[DEBUG_SPRINTF_MAX_ARGS]; + + /* count of longs fit into one entry */ + num_longs = id->buf_size / sizeof(long); + + if (num_longs < 1) + goto out; /* bufsize of entry too small */ + if (num_longs == 1) { + /* no args, we use only the string */ + strcpy(out_buf, curr_event->string); + rc = strlen(curr_event->string); + goto out; + } + + /* number of arguments used for sprintf (without the format string) */ + num_used_args = min(DEBUG_SPRINTF_MAX_ARGS, (num_longs - 1)); + + memset(index, 0, DEBUG_SPRINTF_MAX_ARGS * sizeof(int)); + + for (i = 0; i < num_used_args; i++) + index[i] = i; + + rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); +out: + return rc; +} + +/* + * debug_init: + * - is called exactly once to initialize the debug feature + */ +static int __init debug_init(void) +{ + s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table); + mutex_lock(&debug_mutex); + debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); + initialized = 1; + mutex_unlock(&debug_mutex); + return 0; +} +postcore_initcall(debug_init); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c new file mode 100644 index 0000000000..f9f06cd8fc --- /dev/null +++ b/arch/s390/kernel/diag.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of s390 diagnose codes + * + * Copyright IBM Corp. 2007 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +struct diag_stat { + unsigned int counter[NR_DIAG_STAT]; +}; + +static DEFINE_PER_CPU(struct diag_stat, diag_stat); + +struct diag_desc { + int code; + char *name; +}; + +static const struct diag_desc diag_map[NR_DIAG_STAT] = { + [DIAG_STAT_X008] = { .code = 0x008, .name = "Console Function" }, + [DIAG_STAT_X00C] = { .code = 0x00c, .name = "Pseudo Timer" }, + [DIAG_STAT_X010] = { .code = 0x010, .name = "Release Pages" }, + [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, + [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, + [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, + [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, + [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, + [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, + [DIAG_STAT_X210] = { .code = 0x210, .name = "Device Information" }, + [DIAG_STAT_X224] = { .code = 0x224, .name = "EBCDIC-Name Table" }, + [DIAG_STAT_X250] = { .code = 0x250, .name = "Block I/O" }, + [DIAG_STAT_X258] = { .code = 0x258, .name = "Page-Reference Services" }, + [DIAG_STAT_X26C] = { .code = 0x26c, .name = "Certain System Information" }, + [DIAG_STAT_X288] = { .code = 0x288, .name = "Time Bomb" }, + [DIAG_STAT_X2C4] = { .code = 0x2c4, .name = "FTP Services" }, + [DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" }, + [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, + [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, + [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, + [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, +}; + +struct diag_ops __amode31_ref diag_amode31_ops = { + .diag210 = _diag210_amode31, + .diag26c = _diag26c_amode31, + .diag14 = _diag14_amode31, + .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, + .diag308_reset = _diag308_reset_amode31 +}; + +static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); +struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; + +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; + +static int show_diag_stat(struct seq_file *m, void *v) +{ + struct diag_stat *stat; + unsigned long n = (unsigned long) v - 1; + int cpu, prec, tmp; + + cpus_read_lock(); + if (n == 0) { + seq_puts(m, " "); + + for_each_online_cpu(cpu) { + prec = 10; + for (tmp = 10; cpu >= tmp; tmp *= 10) + prec--; + seq_printf(m, "%*s%d", prec, "CPU", cpu); + } + seq_putc(m, '\n'); + } else if (n <= NR_DIAG_STAT) { + seq_printf(m, "diag %03x:", diag_map[n-1].code); + for_each_online_cpu(cpu) { + stat = &per_cpu(diag_stat, cpu); + seq_printf(m, " %10u", stat->counter[n-1]); + } + seq_printf(m, " %s\n", diag_map[n-1].name); + } + cpus_read_unlock(); + return 0; +} + +static void *show_diag_stat_start(struct seq_file *m, loff_t *pos) +{ + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; +} + +static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return show_diag_stat_start(m, pos); +} + +static void show_diag_stat_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations show_diag_stat_sops = { + .start = show_diag_stat_start, + .next = show_diag_stat_next, + .stop = show_diag_stat_stop, + .show = show_diag_stat, +}; + +DEFINE_SEQ_ATTRIBUTE(show_diag_stat); + +static int __init show_diag_stat_init(void) +{ + debugfs_create_file("diag_stat", 0400, NULL, NULL, + &show_diag_stat_fops); + return 0; +} + +device_initcall(show_diag_stat_init); + +void diag_stat_inc(enum diag_stat_enum nr) +{ + this_cpu_inc(diag_stat.counter[nr]); + trace_s390_diagnose(diag_map[nr].code); +} +EXPORT_SYMBOL(diag_stat_inc); + +void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) +{ + this_cpu_inc(diag_stat.counter[nr]); + trace_s390_diagnose_norecursion(diag_map[nr].code); +} +EXPORT_SYMBOL(diag_stat_inc_norecursion); + +/* + * Diagnose 14: Input spool file manipulation + */ +int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) +{ + diag_stat_inc(DIAG_STAT_X014); + return diag_amode31_ops.diag14(rx, ry1, subcode); +} +EXPORT_SYMBOL(diag14); + +static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) +{ + union register_pair rp = { .even = *subcode, .odd = size }; + + asm volatile( + " diag %[addr],%[rp],0x204\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory"); + *subcode = rp.even; + return rp.odd; +} + +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @area points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero. If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. + */ +int diag204(unsigned long subcode, unsigned long size, void *addr) +{ + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -1; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -1; + } + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); + diag_stat_inc(DIAG_STAT_X204); + size = __diag204(&subcode, size, addr); + if (subcode) + return -1; + return size; +} +EXPORT_SYMBOL(diag204); + +/* + * Diagnose 210: Get information about a virtual device + */ +int diag210(struct diag210 *addr) +{ + static DEFINE_SPINLOCK(diag210_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag210_lock, flags); + *__diag210_tmp_amode31 = *addr; + + diag_stat_inc(DIAG_STAT_X210); + ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31); + + *addr = *__diag210_tmp_amode31; + spin_unlock_irqrestore(&diag210_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag210); + +/* + * Diagnose 8C: Access 3270 Display Device Information + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + +int diag224(void *ptr) +{ + int rc = -EOPNOTSUPP; + + diag_stat_inc(DIAG_STAT_X224); + asm volatile( + " diag %1,%2,0x224\n" + "0: lhi %0,0x0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + return rc; +} +EXPORT_SYMBOL(diag224); + +/* + * Diagnose 26C: Access Certain System Information + */ +int diag26c(void *req, void *resp, enum diag26c_sc subcode) +{ + diag_stat_inc(DIAG_STAT_X26C); + return diag_amode31_ops.diag26c(req, resp, subcode); +} +EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c new file mode 100644 index 0000000000..89dc826a8d --- /dev/null +++ b/arch/s390/kernel/dis.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Disassemble s390 instructions. + * + * Copyright IBM Corp. 2007 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Type of operand */ +#define OPERAND_GPR 0x1 /* Operand printed as %rx */ +#define OPERAND_FPR 0x2 /* Operand printed as %fx */ +#define OPERAND_AR 0x4 /* Operand printed as %ax */ +#define OPERAND_CR 0x8 /* Operand printed as %cx */ +#define OPERAND_VR 0x10 /* Operand printed as %vx */ +#define OPERAND_DISP 0x20 /* Operand printed as displacement */ +#define OPERAND_BASE 0x40 /* Operand printed as base register */ +#define OPERAND_INDEX 0x80 /* Operand printed as index register */ +#define OPERAND_PCREL 0x100 /* Operand printed as pc-relative symbol */ +#define OPERAND_SIGNED 0x200 /* Operand printed as signed value */ +#define OPERAND_LENGTH 0x400 /* Operand printed as length (+1) */ + +struct s390_operand { + unsigned char bits; /* The number of bits in the operand. */ + unsigned char shift; /* The number of bits to shift. */ + unsigned short flags; /* One bit syntax flags. */ +}; + +struct s390_insn { + union { + const char name[5]; + struct { + unsigned char zero; + unsigned int offset; + } __packed; + }; + unsigned char opfrag; + unsigned char format; +}; + +struct s390_opcode_offset { + unsigned char opcode; + unsigned char mask; + unsigned char byte; + unsigned short offset; + unsigned short count; +} __packed; + +enum { + UNUSED, + A_8, /* Access reg. starting at position 8 */ + A_12, /* Access reg. starting at position 12 */ + A_24, /* Access reg. starting at position 24 */ + A_28, /* Access reg. starting at position 28 */ + B_16, /* Base register starting at position 16 */ + B_32, /* Base register starting at position 32 */ + C_8, /* Control reg. starting at position 8 */ + C_12, /* Control reg. starting at position 12 */ + D20_20, /* 20 bit displacement starting at 20 */ + D_20, /* Displacement starting at position 20 */ + D_36, /* Displacement starting at position 36 */ + F_8, /* FPR starting at position 8 */ + F_12, /* FPR starting at position 12 */ + F_16, /* FPR starting at position 16 */ + F_24, /* FPR starting at position 24 */ + F_28, /* FPR starting at position 28 */ + F_32, /* FPR starting at position 32 */ + I8_8, /* 8 bit signed value starting at 8 */ + I8_32, /* 8 bit signed value starting at 32 */ + I16_16, /* 16 bit signed value starting at 16 */ + I16_32, /* 16 bit signed value starting at 32 */ + I32_16, /* 32 bit signed value starting at 16 */ + J12_12, /* 12 bit PC relative offset at 12 */ + J16_16, /* 16 bit PC relative offset at 16 */ + J16_32, /* 16 bit PC relative offset at 32 */ + J24_24, /* 24 bit PC relative offset at 24 */ + J32_16, /* 32 bit PC relative offset at 16 */ + L4_8, /* 4 bit length starting at position 8 */ + L4_12, /* 4 bit length starting at position 12 */ + L8_8, /* 8 bit length starting at position 8 */ + R_8, /* GPR starting at position 8 */ + R_12, /* GPR starting at position 12 */ + R_16, /* GPR starting at position 16 */ + R_24, /* GPR starting at position 24 */ + R_28, /* GPR starting at position 28 */ + U4_8, /* 4 bit unsigned value starting at 8 */ + U4_12, /* 4 bit unsigned value starting at 12 */ + U4_16, /* 4 bit unsigned value starting at 16 */ + U4_20, /* 4 bit unsigned value starting at 20 */ + U4_24, /* 4 bit unsigned value starting at 24 */ + U4_28, /* 4 bit unsigned value starting at 28 */ + U4_32, /* 4 bit unsigned value starting at 32 */ + U4_36, /* 4 bit unsigned value starting at 36 */ + U8_8, /* 8 bit unsigned value starting at 8 */ + U8_16, /* 8 bit unsigned value starting at 16 */ + U8_24, /* 8 bit unsigned value starting at 24 */ + U8_28, /* 8 bit unsigned value starting at 28 */ + U8_32, /* 8 bit unsigned value starting at 32 */ + U12_16, /* 12 bit unsigned value starting at 16 */ + U16_16, /* 16 bit unsigned value starting at 16 */ + U16_32, /* 16 bit unsigned value starting at 32 */ + U32_16, /* 32 bit unsigned value starting at 16 */ + VX_12, /* Vector index register starting at position 12 */ + V_8, /* Vector reg. starting at position 8 */ + V_12, /* Vector reg. starting at position 12 */ + V_16, /* Vector reg. starting at position 16 */ + V_32, /* Vector reg. starting at position 32 */ + X_12, /* Index register starting at position 12 */ +}; + +static const struct s390_operand operands[] = { + [UNUSED] = { 0, 0, 0 }, + [A_8] = { 4, 8, OPERAND_AR }, + [A_12] = { 4, 12, OPERAND_AR }, + [A_24] = { 4, 24, OPERAND_AR }, + [A_28] = { 4, 28, OPERAND_AR }, + [B_16] = { 4, 16, OPERAND_BASE | OPERAND_GPR }, + [B_32] = { 4, 32, OPERAND_BASE | OPERAND_GPR }, + [C_8] = { 4, 8, OPERAND_CR }, + [C_12] = { 4, 12, OPERAND_CR }, + [D20_20] = { 20, 20, OPERAND_DISP | OPERAND_SIGNED }, + [D_20] = { 12, 20, OPERAND_DISP }, + [D_36] = { 12, 36, OPERAND_DISP }, + [F_8] = { 4, 8, OPERAND_FPR }, + [F_12] = { 4, 12, OPERAND_FPR }, + [F_16] = { 4, 16, OPERAND_FPR }, + [F_24] = { 4, 24, OPERAND_FPR }, + [F_28] = { 4, 28, OPERAND_FPR }, + [F_32] = { 4, 32, OPERAND_FPR }, + [I8_8] = { 8, 8, OPERAND_SIGNED }, + [I8_32] = { 8, 32, OPERAND_SIGNED }, + [I16_16] = { 16, 16, OPERAND_SIGNED }, + [I16_32] = { 16, 32, OPERAND_SIGNED }, + [I32_16] = { 32, 16, OPERAND_SIGNED }, + [J12_12] = { 12, 12, OPERAND_PCREL }, + [J16_16] = { 16, 16, OPERAND_PCREL }, + [J16_32] = { 16, 32, OPERAND_PCREL }, + [J24_24] = { 24, 24, OPERAND_PCREL }, + [J32_16] = { 32, 16, OPERAND_PCREL }, + [L4_8] = { 4, 8, OPERAND_LENGTH }, + [L4_12] = { 4, 12, OPERAND_LENGTH }, + [L8_8] = { 8, 8, OPERAND_LENGTH }, + [R_8] = { 4, 8, OPERAND_GPR }, + [R_12] = { 4, 12, OPERAND_GPR }, + [R_16] = { 4, 16, OPERAND_GPR }, + [R_24] = { 4, 24, OPERAND_GPR }, + [R_28] = { 4, 28, OPERAND_GPR }, + [U4_8] = { 4, 8, 0 }, + [U4_12] = { 4, 12, 0 }, + [U4_16] = { 4, 16, 0 }, + [U4_20] = { 4, 20, 0 }, + [U4_24] = { 4, 24, 0 }, + [U4_28] = { 4, 28, 0 }, + [U4_32] = { 4, 32, 0 }, + [U4_36] = { 4, 36, 0 }, + [U8_8] = { 8, 8, 0 }, + [U8_16] = { 8, 16, 0 }, + [U8_24] = { 8, 24, 0 }, + [U8_28] = { 8, 28, 0 }, + [U8_32] = { 8, 32, 0 }, + [U12_16] = { 12, 16, 0 }, + [U16_16] = { 16, 16, 0 }, + [U16_32] = { 16, 32, 0 }, + [U32_16] = { 32, 16, 0 }, + [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, + [V_8] = { 4, 8, OPERAND_VR }, + [V_12] = { 4, 12, OPERAND_VR }, + [V_16] = { 4, 16, OPERAND_VR }, + [V_32] = { 4, 32, OPERAND_VR }, + [X_12] = { 4, 12, OPERAND_INDEX | OPERAND_GPR }, +}; + +static const unsigned char formats[][6] = { + [INSTR_E] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_IE_UU] = { U4_24, U4_28, 0, 0, 0, 0 }, + [INSTR_MII_UPP] = { U4_8, J12_12, J24_24 }, + [INSTR_RIE_R0IU] = { R_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_R0UU] = { R_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_RIE_RRI0] = { R_8, R_12, I16_16, 0, 0, 0 }, + [INSTR_RIE_RRP] = { R_8, R_12, J16_16, 0, 0, 0 }, + [INSTR_RIE_RRPU] = { R_8, R_12, U4_32, J16_16, 0, 0 }, + [INSTR_RIE_RRUUU] = { R_8, R_12, U8_16, U8_24, U8_32, 0 }, + [INSTR_RIE_RUI0] = { R_8, I16_16, U4_12, 0, 0, 0 }, + [INSTR_RIE_RUPI] = { R_8, I8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIE_RUPU] = { R_8, U8_32, U4_12, J16_16, 0, 0 }, + [INSTR_RIL_RI] = { R_8, I32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RP] = { R_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIL_RU] = { R_8, U32_16, 0, 0, 0, 0 }, + [INSTR_RIL_UP] = { U4_8, J32_16, 0, 0, 0, 0 }, + [INSTR_RIS_RURDI] = { R_8, I8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RIS_RURDU] = { R_8, U8_32, U4_12, D_20, B_16, 0 }, + [INSTR_RI_RI] = { R_8, I16_16, 0, 0, 0, 0 }, + [INSTR_RI_RP] = { R_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RI_RU] = { R_8, U16_16, 0, 0, 0, 0 }, + [INSTR_RI_UP] = { U4_8, J16_16, 0, 0, 0, 0 }, + [INSTR_RRE_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_RRE_AA] = { A_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_AR] = { A_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_F0] = { F_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_FF] = { F_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_FR] = { F_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRE_R0] = { R_24, 0, 0, 0, 0, 0 }, + [INSTR_RRE_RA] = { R_24, A_28, 0, 0, 0, 0 }, + [INSTR_RRE_RF] = { R_24, F_28, 0, 0, 0, 0 }, + [INSTR_RRE_RR] = { R_24, R_28, 0, 0, 0, 0 }, + [INSTR_RRF_0UFF] = { F_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_0URF] = { R_24, F_28, U4_20, 0, 0, 0 }, + [INSTR_RRF_F0FF] = { F_16, F_24, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FF2] = { F_24, F_16, F_28, 0, 0, 0 }, + [INSTR_RRF_F0FR] = { F_24, F_16, R_28, 0, 0, 0 }, + [INSTR_RRF_FFRU] = { F_24, F_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF] = { F_24, F_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_FUFF2] = { F_24, F_28, F_16, U4_20, 0, 0 }, + [INSTR_RRF_R0RR] = { R_24, R_16, R_28, 0, 0, 0 }, + [INSTR_RRF_R0RR2] = { R_24, R_28, R_16, 0, 0, 0 }, + [INSTR_RRF_RURR] = { R_24, R_28, R_16, U4_20, 0, 0 }, + [INSTR_RRF_RURR2] = { R_24, R_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_U0FF] = { F_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RF] = { R_24, U4_16, F_28, 0, 0, 0 }, + [INSTR_RRF_U0RR] = { R_24, R_28, U4_16, 0, 0, 0 }, + [INSTR_RRF_URR] = { R_24, R_28, U8_16, 0, 0, 0 }, + [INSTR_RRF_UUFF] = { F_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRF_UUFR] = { F_24, U4_16, R_28, U4_20, 0, 0 }, + [INSTR_RRF_UURF] = { R_24, U4_16, F_28, U4_20, 0, 0 }, + [INSTR_RRS_RRRDU] = { R_8, R_12, U4_32, D_20, B_16 }, + [INSTR_RR_FF] = { F_8, F_12, 0, 0, 0, 0 }, + [INSTR_RR_R0] = { R_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_RR] = { R_8, R_12, 0, 0, 0, 0 }, + [INSTR_RR_U0] = { U8_8, 0, 0, 0, 0, 0 }, + [INSTR_RR_UR] = { U4_8, R_12, 0, 0, 0, 0 }, + [INSTR_RSI_RRP] = { R_8, R_12, J16_16, 0, 0, 0 }, + [INSTR_RSL_LRDFU] = { F_32, D_20, L8_8, B_16, U4_36, 0 }, + [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, + [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, + [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, + [INSTR_RS_AARD] = { A_8, A_12, D_20, B_16, 0, 0 }, + [INSTR_RS_CCRD] = { C_8, C_12, D_20, B_16, 0, 0 }, + [INSTR_RS_R0RD] = { R_8, D_20, B_16, 0, 0, 0 }, + [INSTR_RS_RRRD] = { R_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_RS_RURD] = { R_8, U4_12, D_20, B_16, 0, 0 }, + [INSTR_RXE_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RXE_RRRDU] = { R_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_RXF_FRRDF] = { F_32, F_8, D_20, X_12, B_16, 0 }, + [INSTR_RXY_FRRD] = { F_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_RRRD] = { R_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RXY_URRD] = { U4_8, D20_20, X_12, B_16, 0, 0 }, + [INSTR_RX_FRRD] = { F_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_RRRD] = { R_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_RX_URRD] = { U4_8, D_20, X_12, B_16, 0, 0 }, + [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, + [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, + [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 }, + [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, + [INSTR_SMI_U0RDP] = { U4_8, J16_32, D_20, B_16, 0, 0 }, + [INSTR_SSE_RDRD] = { D_20, B_16, D_36, B_32, 0, 0 }, + [INSTR_SSF_RRDRD] = { D_20, B_16, D_36, B_32, R_8, 0 }, + [INSTR_SSF_RRDRD2] = { R_8, D_20, B_16, D_36, B_32, 0 }, + [INSTR_SS_L0RDRD] = { D_20, L8_8, B_16, D_36, B_32, 0 }, + [INSTR_SS_L2RDRD] = { D_20, B_16, D_36, L8_8, B_32, 0 }, + [INSTR_SS_LIRDRD] = { D_20, L4_8, B_16, D_36, B_32, U4_12 }, + [INSTR_SS_LLRDRD] = { D_20, L4_8, B_16, D_36, L4_12, B_32 }, + [INSTR_SS_RRRDRD] = { D_20, R_8, B_16, D_36, B_32, R_12 }, + [INSTR_SS_RRRDRD2] = { R_8, D_20, B_16, R_12, D_36, B_32 }, + [INSTR_SS_RRRDRD3] = { R_8, R_12, D_20, B_16, D_36, B_32 }, + [INSTR_S_00] = { 0, 0, 0, 0, 0, 0 }, + [INSTR_S_RD] = { D_20, B_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0IU] = { V_8, I16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0U] = { V_8, U16_16, 0, 0, 0, 0 }, + [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, + [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, + [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, + [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, + [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, + [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, + [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, + [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, + [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 }, + [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, + [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, + [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 }, + [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, + [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, + [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, + [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 }, + [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 }, + [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, + [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, + [INSTR_VRR_VVV0V] = { V_8, V_12, V_16, V_32, 0, 0 }, + [INSTR_VRR_VVVU0UV] = { V_8, V_12, V_16, V_32, U4_28, U4_20 }, + [INSTR_VRR_VVVU0V] = { V_8, V_12, V_16, V_32, U4_20, 0 }, + [INSTR_VRR_VVVUU0V] = { V_8, V_12, V_16, V_32, U4_20, U4_24 }, + [INSTR_VRS_RRDV] = { V_32, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_RVRDU] = { R_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VRRD] = { V_8, R_12, D_20, B_16, 0, 0 }, + [INSTR_VRS_VRRDU] = { V_8, R_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRS_VVRDU] = { V_8, V_12, D_20, B_16, U4_32, 0 }, + [INSTR_VRV_VVXRDU] = { V_8, D_20, VX_12, B_16, U4_32, 0 }, + [INSTR_VRX_VRRDU] = { V_8, D_20, X_12, B_16, U4_32, 0 }, + [INSTR_VRX_VV] = { V_8, V_12, 0, 0, 0, 0 }, + [INSTR_VSI_URDV] = { V_32, D_20, B_16, U8_8, 0, 0 }, +}; + +static char long_insn_name[][7] = LONG_INSN_INITIALIZER; +static struct s390_insn opcode[] = OPCODE_TABLE_INITIALIZER; +static struct s390_opcode_offset opcode_offset[] = OPCODE_OFFSET_INITIALIZER; + +/* Extracts an operand value from an instruction. */ +static unsigned int extract_operand(unsigned char *code, + const struct s390_operand *operand) +{ + unsigned char *cp; + unsigned int val; + int bits; + + /* Extract fragments of the operand byte for byte. */ + cp = code + operand->shift / 8; + bits = (operand->shift & 7) + operand->bits; + val = 0; + do { + val <<= 8; + val |= (unsigned int) *cp++; + bits -= 8; + } while (bits > 0); + val >>= -bits; + val &= ((1U << (operand->bits - 1)) << 1) - 1; + + /* Check for special long displacement case. */ + if (operand->bits == 20 && operand->shift == 20) + val = (val & 0xff) << 12 | (val & 0xfff00) >> 8; + + /* Check for register extensions bits for vector registers. */ + if (operand->flags & OPERAND_VR) { + if (operand->shift == 8) + val |= (code[4] & 8) << 1; + else if (operand->shift == 12) + val |= (code[4] & 4) << 2; + else if (operand->shift == 16) + val |= (code[4] & 2) << 3; + else if (operand->shift == 32) + val |= (code[4] & 1) << 4; + } + + /* Sign extend value if the operand is signed or pc relative. */ + if ((operand->flags & (OPERAND_SIGNED | OPERAND_PCREL)) && + (val & (1U << (operand->bits - 1)))) + val |= (-1U << (operand->bits - 1)) << 1; + + /* Double value if the operand is pc relative. */ + if (operand->flags & OPERAND_PCREL) + val <<= 1; + + /* Length x in an instructions has real length x + 1. */ + if (operand->flags & OPERAND_LENGTH) + val++; + return val; +} + +struct s390_insn *find_insn(unsigned char *code) +{ + struct s390_opcode_offset *entry; + struct s390_insn *insn; + unsigned char opfrag; + int i; + + /* Search the opcode offset table to find an entry which + * matches the beginning of the opcode. If there is no match + * the last entry will be used, which is the default entry for + * unknown instructions as well as 1-byte opcode instructions. + */ + for (i = 0; i < ARRAY_SIZE(opcode_offset); i++) { + entry = &opcode_offset[i]; + if (entry->opcode == code[0]) + break; + } + + opfrag = *(code + entry->byte) & entry->mask; + + insn = &opcode[entry->offset]; + for (i = 0; i < entry->count; i++) { + if (insn->opfrag == opfrag) + return insn; + insn++; + } + return NULL; +} + +static int print_insn(char *buffer, unsigned char *code, unsigned long addr) +{ + struct s390_insn *insn; + const unsigned char *ops; + const struct s390_operand *operand; + unsigned int value; + char separator; + char *ptr; + int i; + + ptr = buffer; + insn = find_insn(code); + if (insn) { + if (insn->zero == 0) + ptr += sprintf(ptr, "%.7s\t", + long_insn_name[insn->offset]); + else + ptr += sprintf(ptr, "%.5s\t", insn->name); + /* Extract the operands. */ + separator = 0; + for (ops = formats[insn->format], i = 0; + *ops != 0 && i < 6; ops++, i++) { + operand = operands + *ops; + value = extract_operand(code, operand); + if ((operand->flags & OPERAND_INDEX) && value == 0) + continue; + if ((operand->flags & OPERAND_BASE) && + value == 0 && separator == '(') { + separator = ','; + continue; + } + if (separator) + ptr += sprintf(ptr, "%c", separator); + if (operand->flags & OPERAND_GPR) + ptr += sprintf(ptr, "%%r%i", value); + else if (operand->flags & OPERAND_FPR) + ptr += sprintf(ptr, "%%f%i", value); + else if (operand->flags & OPERAND_AR) + ptr += sprintf(ptr, "%%a%i", value); + else if (operand->flags & OPERAND_CR) + ptr += sprintf(ptr, "%%c%i", value); + else if (operand->flags & OPERAND_VR) + ptr += sprintf(ptr, "%%v%i", value); + else if (operand->flags & OPERAND_PCREL) { + void *pcrel = (void *)((int)value + addr); + + ptr += sprintf(ptr, "%px", pcrel); + } else if (operand->flags & OPERAND_SIGNED) + ptr += sprintf(ptr, "%i", value); + else + ptr += sprintf(ptr, "%u", value); + if (operand->flags & OPERAND_DISP) + separator = '('; + else if (operand->flags & OPERAND_BASE) { + ptr += sprintf(ptr, ")"); + separator = ','; + } else + separator = ','; + } + } else + ptr += sprintf(ptr, "unknown"); + return (int) (ptr - buffer); +} + +static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) +{ + if (user_mode(regs)) { + if (copy_from_user(dst, (char __user *)src, len)) + return -EFAULT; + } else { + if (copy_from_kernel_nofault(dst, src, len)) + return -EFAULT; + } + return 0; +} + +void show_code(struct pt_regs *regs) +{ + char *mode = user_mode(regs) ? "User" : "Krnl"; + unsigned char code[64]; + char buffer[128], *ptr; + unsigned long addr; + int start, end, opsize, hops, i; + + /* Get a snapshot of the 64 bytes surrounding the fault address. */ + for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { + addr = regs->psw.addr - 34 + start; + if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) + break; + } + for (end = 32; end < 64; end += 2) { + addr = regs->psw.addr + end - 32; + if (copy_from_regs(regs, code + end, (void *)addr, 2)) + break; + } + /* Code snapshot usable ? */ + if ((regs->psw.addr & 1) || start >= end) { + printk("%s Code: Bad PSW.\n", mode); + return; + } + /* Find a starting point for the disassembly. */ + while (start < 32) { + for (i = 0, hops = 0; start + i < 32 && hops < 3; hops++) { + if (!find_insn(code + start + i)) + break; + i += insn_length(code[start + i]); + } + if (start + i == 32) + /* Looks good, sequence ends at PSW. */ + break; + start += 2; + } + /* Decode the instructions. */ + ptr = buffer; + ptr += sprintf(ptr, "%s Code:", mode); + hops = 0; + while (start < end && hops < 8) { + opsize = insn_length(code[start]); + if (start + opsize == 32) + *ptr++ = '#'; + else if (start == 32) + *ptr++ = '>'; + else + *ptr++ = ' '; + addr = regs->psw.addr + start - 32; + ptr += sprintf(ptr, "%px: ", (void *)addr); + if (start + opsize >= end) + break; + for (i = 0; i < opsize; i++) + ptr += sprintf(ptr, "%02x", code[start + i]); + *ptr++ = '\t'; + if (i < 6) + *ptr++ = '\t'; + ptr += print_insn(ptr, code + start, addr); + start += opsize; + pr_cont("%s", buffer); + ptr = buffer; + ptr += sprintf(ptr, "\n "); + hops++; + } + pr_cont("\n"); +} + +void print_fn_code(unsigned char *code, unsigned long len) +{ + char buffer[128], *ptr; + int opsize, i; + + while (len) { + ptr = buffer; + opsize = insn_length(*code); + if (opsize > len) + break; + ptr += sprintf(ptr, "%px: ", code); + for (i = 0; i < opsize; i++) + ptr += sprintf(ptr, "%02x", code[i]); + *ptr++ = '\t'; + if (i < 4) + *ptr++ = '\t'; + ptr += print_insn(ptr, code, (unsigned long) code); + *ptr++ = '\n'; + *ptr++ = 0; + printk("%s", buffer); + code += opsize; + len -= opsize; + } +} diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c new file mode 100644 index 0000000000..d2012635b0 --- /dev/null +++ b/arch/s390/kernel/dumpstack.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stack dumping functions + * + * Copyright IBM Corp. 1999, 2013 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char *stack_type_name(enum stack_type type) +{ + switch (type) { + case STACK_TYPE_TASK: + return "task"; + case STACK_TYPE_IRQ: + return "irq"; + case STACK_TYPE_NODAT: + return "nodat"; + case STACK_TYPE_RESTART: + return "restart"; + default: + return "unknown"; + } +} +EXPORT_SYMBOL_GPL(stack_type_name); + +static inline bool in_stack(unsigned long sp, struct stack_info *info, + enum stack_type type, unsigned long stack) +{ + if (sp < stack || sp >= stack + THREAD_SIZE) + return false; + info->type = type; + info->begin = stack; + info->end = stack + THREAD_SIZE; + return true; +} + +static bool in_task_stack(unsigned long sp, struct task_struct *task, + struct stack_info *info) +{ + unsigned long stack = (unsigned long)task_stack_page(task); + + return in_stack(sp, info, STACK_TYPE_TASK, stack); +} + +static bool in_irq_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET; + + return in_stack(sp, info, STACK_TYPE_IRQ, stack); +} + +static bool in_nodat_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET; + + return in_stack(sp, info, STACK_TYPE_NODAT, stack); +} + +static bool in_mcck_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET; + + return in_stack(sp, info, STACK_TYPE_MCCK, stack); +} + +static bool in_restart_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET; + + return in_stack(sp, info, STACK_TYPE_RESTART, stack); +} + +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!sp) + goto unknown; + + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (sp & 0x7) + goto unknown; + + /* Check per-task stack */ + if (in_task_stack(sp, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + /* Check per-cpu stacks */ + if (!in_irq_stack(sp, info) && + !in_nodat_stack(sp, info) && + !in_restart_stack(sp, info) && + !in_mcck_stack(sp, info)) + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + return 0; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; +} + +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) +{ + struct unwind_state state; + + printk("%sCall Trace:\n", loglvl); + unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) + printk(state.reliable ? "%s [<%016lx>] %pSR \n" : + "%s([<%016lx>] %pSR)\n", + loglvl, state.ip, (void *) state.ip); + debug_show_held_locks(task ? : current); +} + +static void show_last_breaking_event(struct pt_regs *regs) +{ + printk("Last Breaking-Event-Address:\n"); + printk(" [<%016lx>] ", regs->last_break); + if (user_mode(regs)) { + print_vma_addr(KERN_CONT, regs->last_break); + pr_cont("\n"); + } else { + pr_cont("%pSR\n", (void *)regs->last_break); + } +} + +void show_registers(struct pt_regs *regs) +{ + struct psw_bits *psw = &psw_bits(regs->psw); + char *mode; + + mode = user_mode(regs) ? "User" : "Krnl"; + printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)regs->psw.addr); + if (!user_mode(regs)) + pr_cont(" (%pSR)", (void *)regs->psw.addr); + pr_cont("\n"); + printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " + "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, + psw->key, psw->mcheck, psw->wait, psw->pstate, psw->as, psw->cc, psw->pm); + pr_cont(" RI:%x EA:%x\n", psw->ri, psw->eaba); + printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, + regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[4], regs->gprs[5], regs->gprs[6], regs->gprs[7]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[8], regs->gprs[9], regs->gprs[10], regs->gprs[11]); + printk(" %016lx %016lx %016lx %016lx\n", + regs->gprs[12], regs->gprs[13], regs->gprs[14], regs->gprs[15]); + show_code(regs); +} + +void show_regs(struct pt_regs *regs) +{ + show_regs_print_info(KERN_DEFAULT); + show_registers(regs); + /* Show stack backtrace if pt_regs is from kernel mode */ + if (!user_mode(regs)) + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); + show_last_breaking_event(regs); +} + +static DEFINE_SPINLOCK(die_lock); + +void __noreturn die(struct pt_regs *regs, const char *str) +{ + static int die_counter; + + oops_enter(); + lgr_info_log(); + debug_stop_all(); + console_verbose(); + spin_lock_irq(&die_lock); + bust_spinlocks(1); + printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + regs->int_code >> 17, ++die_counter); +#ifdef CONFIG_PREEMPT + pr_cont("PREEMPT "); +#elif defined(CONFIG_PREEMPT_RT) + pr_cont("PREEMPT_RT "); +#endif + pr_cont("SMP "); + if (debug_pagealloc_enabled()) + pr_cont("DEBUG_PAGEALLOC"); + pr_cont("\n"); + notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV); + print_modules(); + show_regs(regs); + bust_spinlocks(0); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + spin_unlock_irq(&die_lock); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception: panic_on_oops"); + oops_exit(); + make_task_dead(SIGSEGV); +} diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c new file mode 100644 index 0000000000..442ce0489e --- /dev/null +++ b/arch/s390/kernel/early.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2009 + * Author(s): Hongjie Yang , + */ + +#define KMSG_COMPONENT "setup" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +#define decompressor_handled_param(param) \ +static int __init ignore_decompressor_param_##param(char *s) \ +{ \ + return 0; \ +} \ +early_param(#param, ignore_decompressor_param_##param) + +decompressor_handled_param(mem); +decompressor_handled_param(vmalloc); +decompressor_handled_param(dfltcc); +decompressor_handled_param(facilities); +decompressor_handled_param(nokaslr); +#if IS_ENABLED(CONFIG_KVM) +decompressor_handled_param(prot_virt); +#endif + +static void __init kasan_early_init(void) +{ +#ifdef CONFIG_KASAN + init_task.kasan_depth = 0; + sclp_early_printk("KernelAddressSanitizer initialized\n"); +#endif +} + +static void __init reset_tod_clock(void) +{ + union tod_clock clk; + + if (store_tod_clock_ext_cc(&clk) == 0) + return; + /* TOD clock not running. Set the clock to Unix Epoch. */ + if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) + disabled_wait(); + + memset(&tod_clock_base, 0, sizeof(tod_clock_base)); + tod_clock_base.tod = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; +} + +/* + * Initialize storage key for kernel pages + */ +static noinline __init void init_kernel_storage_key(void) +{ +#if PAGE_DEFAULT_KEY + unsigned long end_pfn, init_pfn; + + end_pfn = PFN_UP(__pa(_end)); + + for (init_pfn = 0 ; init_pfn < end_pfn; init_pfn++) + page_set_storage_key(init_pfn << PAGE_SHIFT, + PAGE_DEFAULT_KEY, 0); +#endif +} + +static __initdata char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static noinline __init void detect_machine_type(void) +{ + struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; + + /* Check current-configuration-level */ + if (stsi(NULL, 0, 0, 0) <= 2) { + S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; + return; + } + /* Get virtual-machine cpu information. */ + if (stsi(vmms, 3, 2, 2) || !vmms->count) + return; + + /* Detect known hypervisors */ + if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) + S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; + else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) + S390_lowcore.machine_flags |= MACHINE_FLAG_VM; +} + +/* Remove leading, trailing and double whitespace. */ +static inline void strim_all(char *str) +{ + char *s; + + s = strim(str); + if (s != str) + memmove(str, s, strlen(s)); + while (*str) { + if (!isspace(*str++)) + continue; + if (isspace(*str)) { + s = skip_spaces(str); + memmove(str, s, strlen(s) + 1); + } + } +} + +static noinline __init void setup_arch_string(void) +{ + struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; + struct sysinfo_3_2_2 *vm = (struct sysinfo_3_2_2 *)&sysinfo_page; + char mstr[80], hvstr[17]; + + if (stsi(mach, 1, 1, 1)) + return; + EBCASC(mach->manufacturer, sizeof(mach->manufacturer)); + EBCASC(mach->type, sizeof(mach->type)); + EBCASC(mach->model, sizeof(mach->model)); + EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); + sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); + strim_all(mstr); + if (stsi(vm, 3, 2, 2) == 0 && vm->count) { + EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); + sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + strim_all(hvstr); + } else { + sprintf(hvstr, "%s", + MACHINE_IS_LPAR ? "LPAR" : + MACHINE_IS_VM ? "z/VM" : + MACHINE_IS_KVM ? "KVM" : "unknown"); + } + dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); +} + +static __init void setup_topology(void) +{ + int max_mnest; + + if (!test_facility(11)) + return; + S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; + for (max_mnest = 6; max_mnest > 1; max_mnest--) { + if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) + break; + } + topology_max_mnest = max_mnest; +} + +void __do_early_pgm_check(struct pt_regs *regs) +{ + if (!fixup_exception(regs)) + disabled_wait(); +} + +static noinline __init void setup_lowcore_early(void) +{ + psw_t psw; + + psw.addr = (unsigned long)early_pgm_check_handler; + psw.mask = PSW_KERNEL_BITS; + S390_lowcore.program_new_psw = psw; + S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; +} + +static noinline __init void setup_facility_list(void) +{ + memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); + if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) + __clear_facility(82, alt_stfle_fac_list); +} + +static __init void detect_diag9c(void) +{ + unsigned int cpu_address; + int rc; + + cpu_address = stap(); + diag_stat_inc(DIAG_STAT_X09C); + asm volatile( + " diag %2,0,0x9c\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); + if (!rc) + S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; +} + +static __init void detect_machine_facilities(void) +{ + if (test_facility(8)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; + __ctl_set_bit(0, 23); + } + if (test_facility(78)) + S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; + if (test_facility(3)) + S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; + if (test_facility(50) && test_facility(73)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + __ctl_set_bit(0, 55); + } + if (test_facility(51)) + S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; + if (test_facility(129)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_VX; + __ctl_set_bit(0, 17); + } + if (test_facility(130)) + S390_lowcore.machine_flags |= MACHINE_FLAG_NX; + if (test_facility(133)) + S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base.tod >> 63)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } + if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { + S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + /* the control bit is set during PCI initialization */ + } + if (test_facility(194)) + S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; +} + +static inline void save_vector_registers(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (test_facility(129)) + save_vx_regs(boot_cpu_vector_save_area); +#endif +} + +static inline void setup_control_registers(void) +{ + unsigned long reg; + + __ctl_store(reg, 0, 0); + reg |= CR0_LOW_ADDRESS_PROTECTION; + reg |= CR0_EMERGENCY_SIGNAL_SUBMASK; + reg |= CR0_EXTERNAL_CALL_SUBMASK; + __ctl_load(reg, 0, 0); +} + +static inline void setup_access_registers(void) +{ + unsigned int acrs[NUM_ACRS] = { 0 }; + + restore_access_regs(acrs); +} + +static int __init disable_vector_extension(char *str) +{ + S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; + __ctl_clear_bit(0, 17); + return 0; +} +early_param("novx", disable_vector_extension); + +char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; +static void __init setup_boot_command_line(void) +{ + /* copy arch command line */ + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); +} + +static void __init sort_amode31_extable(void) +{ + sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); +} + +void __init startup_init(void) +{ + kasan_early_init(); + reset_tod_clock(); + time_early_init(); + init_kernel_storage_key(); + lockdep_off(); + sort_amode31_extable(); + setup_lowcore_early(); + setup_facility_list(); + detect_machine_type(); + setup_arch_string(); + setup_boot_command_line(); + detect_diag9c(); + detect_machine_facilities(); + save_vector_registers(); + setup_topology(); + sclp_early_detect(); + setup_control_registers(); + setup_access_registers(); + lockdep_on(); +} diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c new file mode 100644 index 0000000000..d9d53f4400 --- /dev/null +++ b/arch/s390/kernel/early_printk.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2017 + */ + +#include +#include +#include +#include + +static void sclp_early_write(struct console *con, const char *s, unsigned int len) +{ + __sclp_early_printk(s, len); +} + +static struct console sclp_early_console = { + .name = "earlysclp", + .write = sclp_early_write, + .flags = CON_PRINTBUFFER | CON_BOOT, + .index = -1, +}; + +static int __init setup_early_printk(char *buf) +{ + if (early_console) + return 0; + /* Accept only "earlyprintk" and "earlyprintk=sclp" */ + if (buf && !str_has_prefix(buf, "sclp")) + return 0; + if (!sclp.has_linemode && !sclp.has_vt220) + return 0; + early_console = &sclp_early_console; + register_console(early_console); + return 0; +} +early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S new file mode 100644 index 0000000000..c634871f0d --- /dev/null +++ b/arch/s390/kernel/earlypgm.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2006, 2007 + * Author(s): Michael Holzheu + */ + +#include +#include + +SYM_CODE_START(early_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW +SYM_CODE_END(early_pgm_check_handler) diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c new file mode 100644 index 0000000000..0e51fa5372 --- /dev/null +++ b/arch/s390/kernel/ebcdic.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * EBCDIC -> ASCII, ASCII -> EBCDIC, + * upper to lower case (EBCDIC) conversion tables. + * + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Martin Schwidefsky + * Martin Peschke + */ + +#include +#include +#include + +/* + * ASCII (IBM PC 437) -> EBCDIC 037 + */ +__u8 _ascebc[256] = +{ + /*00 NUL SOH STX ETX EOT ENQ ACK BEL */ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, + /*08 BS HT LF VT FF CR SO SI */ + /* ->NL */ + 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /*10 DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, + /*18 CAN EM SUB ESC FS GS RS US */ + /* ->IGS ->IRS ->IUS */ + 0x18, 0x19, 0x3F, 0x27, 0x22, 0x1D, 0x1E, 0x1F, + /*20 SP ! " # $ % & ' */ + 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, + /*28 ( ) * + , - . / */ + 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, + /*30 0 1 2 3 4 5 6 7 */ + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + /*38 8 9 : ; < = > ? */ + 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, + /*40 @ A B C D E F G */ + 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + /*48 H I J K L M N O */ + 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, + /*50 P Q R S T U V W */ + 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, + /*58 X Y Z [ \ ] ^ _ */ + 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D, + /*60 ` a b c d e f g */ + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + /*68 h i j k l m n o */ + 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + /*70 p q r s t u v w */ + 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, + /*78 x y z { | } ~ DL */ + 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, + /*80*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*88*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*90*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*98*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E0 sz */ + 0x3F, 0x59, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F8*/ + 0x90, 0x3F, 0x3F, 0x3F, 0x3F, 0xEA, 0x3F, 0xFF +}; + +/* + * EBCDIC 037 -> ASCII (IBM PC 437) + */ +__u8 _ebcasc[256] = +{ + /* 0x00 NUL SOH STX ETX *SEL HT *RNL DEL */ + 0x00, 0x01, 0x02, 0x03, 0x07, 0x09, 0x07, 0x7F, + /* 0x08 -GE -SPS -RPT VT FF CR SO SI */ + 0x07, 0x07, 0x07, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /* 0x10 DLE DC1 DC2 DC3 -RES -NL BS -POC + -ENP ->LF */ + 0x10, 0x11, 0x12, 0x13, 0x07, 0x0A, 0x08, 0x07, + /* 0x18 CAN EM -UBS -CU1 -IFS -IGS -IRS -ITB + -IUS */ + 0x18, 0x19, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x20 -DS -SOS FS -WUS -BYP LF ETB ESC + -INP */ + 0x07, 0x07, 0x1C, 0x07, 0x07, 0x0A, 0x17, 0x1B, + /* 0x28 -SA -SFE -SM -CSP -MFA ENQ ACK BEL + -SW */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x05, 0x06, 0x07, + /* 0x30 ---- ---- SYN -IR -PP -TRN -NBS EOT */ + 0x07, 0x07, 0x16, 0x07, 0x07, 0x07, 0x07, 0x04, + /* 0x38 -SBS -IT -RFF -CU3 DC4 NAK ---- SUB */ + 0x07, 0x07, 0x07, 0x07, 0x14, 0x15, 0x07, 0x1A, + /* 0x40 SP RSP ä ---- */ + 0x20, 0xFF, 0x83, 0x84, 0x85, 0xA0, 0x07, 0x86, + /* 0x48 . < ( + | */ + 0x87, 0xA4, 0x9B, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, + /* 0x50 & ---- */ + 0x26, 0x82, 0x88, 0x89, 0x8A, 0xA1, 0x8C, 0x07, + /* 0x58 ß ! $ * ) ; */ + 0x8D, 0xE1, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0xAA, + /* 0x60 - / ---- Ä ---- ---- ---- */ + 0x2D, 0x2F, 0x07, 0x8E, 0x07, 0x07, 0x07, 0x8F, + /* 0x68 ---- , % _ > ? */ + 0x80, 0xA5, 0x07, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + /* 0x70 ---- ---- ---- ---- ---- ---- ---- */ + 0x07, 0x90, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x78 * ` : # @ ' = " */ + 0x70, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + /* 0x80 * a b c d e f g */ + 0x07, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + /* 0x88 h i ---- ---- ---- */ + 0x68, 0x69, 0xAE, 0xAF, 0x07, 0x07, 0x07, 0xF1, + /* 0x90 ° j k l m n o p */ + 0xF8, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + /* 0x98 q r ---- ---- */ + 0x71, 0x72, 0xA6, 0xA7, 0x91, 0x07, 0x92, 0x07, + /* 0xA0 ~ s t u v w x */ + 0xE6, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + /* 0xA8 y z ---- ---- ---- ---- */ + 0x79, 0x7A, 0xAD, 0xAB, 0x07, 0x07, 0x07, 0x07, + /* 0xB0 ^ ---- § ---- */ + 0x5E, 0x9C, 0x9D, 0xFA, 0x07, 0x07, 0x07, 0xAC, + /* 0xB8 ---- [ ] ---- ---- ---- ---- */ + 0xAB, 0x07, 0x5B, 0x5D, 0x07, 0x07, 0x07, 0x07, + /* 0xC0 { A B C D E F G */ + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + /* 0xC8 H I ---- ö ---- */ + 0x48, 0x49, 0x07, 0x93, 0x94, 0x95, 0xA2, 0x07, + /* 0xD0 } J K L M N O P */ + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, + /* 0xD8 Q R ---- ü */ + 0x51, 0x52, 0x07, 0x96, 0x81, 0x97, 0xA3, 0x98, + /* 0xE0 \ S T U V W X */ + 0x5C, 0xF6, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + /* 0xE8 Y Z ---- Ö ---- ---- ---- */ + 0x59, 0x5A, 0xFD, 0x07, 0x99, 0x07, 0x07, 0x07, + /* 0xF0 0 1 2 3 4 5 6 7 */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + /* 0xF8 8 9 ---- ---- Ü ---- ---- ---- */ + 0x38, 0x39, 0x07, 0x07, 0x9A, 0x07, 0x07, 0x07 +}; + + +/* + * ASCII (IBM PC 437) -> EBCDIC 500 + */ +__u8 _ascebc_500[256] = +{ + /*00 NUL SOH STX ETX EOT ENQ ACK BEL */ + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, + /*08 BS HT LF VT FF CR SO SI */ + /* ->NL */ + 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /*10 DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, + /*18 CAN EM SUB ESC FS GS RS US */ + /* ->IGS ->IRS ->IUS */ + 0x18, 0x19, 0x3F, 0x27, 0x22, 0x1D, 0x1E, 0x1F, + /*20 SP ! " # $ % & ' */ + 0x40, 0x4F, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, + /*28 ( ) * + , - . / */ + 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, + /*30 0 1 2 3 4 5 6 7 */ + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + /*38 8 9 : ; < = > ? */ + 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, + /*40 @ A B C D E F G */ + 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + /*48 H I J K L M N O */ + 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, + /*50 P Q R S T U V W */ + 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, + /*58 X Y Z [ \ ] ^ _ */ + 0xE7, 0xE8, 0xE9, 0x4A, 0xE0, 0x5A, 0x5F, 0x6D, + /*60 ` a b c d e f g */ + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + /*68 h i j k l m n o */ + 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + /*70 p q r s t u v w */ + 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, + /*78 x y z { | } ~ DL */ + 0xA7, 0xA8, 0xA9, 0xC0, 0xBB, 0xD0, 0xA1, 0x07, + /*80*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*88*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*90*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*98*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*A8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*B8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*C8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*D8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E0 sz */ + 0x3F, 0x59, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*E8*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F0*/ + 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, 0x3F, + /*F8*/ + 0x90, 0x3F, 0x3F, 0x3F, 0x3F, 0xEA, 0x3F, 0xFF +}; + +/* + * EBCDIC 500 -> ASCII (IBM PC 437) + */ +__u8 _ebcasc_500[256] = +{ + /* 0x00 NUL SOH STX ETX *SEL HT *RNL DEL */ + 0x00, 0x01, 0x02, 0x03, 0x07, 0x09, 0x07, 0x7F, + /* 0x08 -GE -SPS -RPT VT FF CR SO SI */ + 0x07, 0x07, 0x07, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + /* 0x10 DLE DC1 DC2 DC3 -RES -NL BS -POC + -ENP ->LF */ + 0x10, 0x11, 0x12, 0x13, 0x07, 0x0A, 0x08, 0x07, + /* 0x18 CAN EM -UBS -CU1 -IFS -IGS -IRS -ITB + -IUS */ + 0x18, 0x19, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x20 -DS -SOS FS -WUS -BYP LF ETB ESC + -INP */ + 0x07, 0x07, 0x1C, 0x07, 0x07, 0x0A, 0x17, 0x1B, + /* 0x28 -SA -SFE -SM -CSP -MFA ENQ ACK BEL + -SW */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x05, 0x06, 0x07, + /* 0x30 ---- ---- SYN -IR -PP -TRN -NBS EOT */ + 0x07, 0x07, 0x16, 0x07, 0x07, 0x07, 0x07, 0x04, + /* 0x38 -SBS -IT -RFF -CU3 DC4 NAK ---- SUB */ + 0x07, 0x07, 0x07, 0x07, 0x14, 0x15, 0x07, 0x1A, + /* 0x40 SP RSP ä ---- */ + 0x20, 0xFF, 0x83, 0x84, 0x85, 0xA0, 0x07, 0x86, + /* 0x48 [ . < ( + ! */ + 0x87, 0xA4, 0x5B, 0x2E, 0x3C, 0x28, 0x2B, 0x21, + /* 0x50 & ---- */ + 0x26, 0x82, 0x88, 0x89, 0x8A, 0xA1, 0x8C, 0x07, + /* 0x58 ß ] $ * ) ; ^ */ + 0x8D, 0xE1, 0x5D, 0x24, 0x2A, 0x29, 0x3B, 0x5E, + /* 0x60 - / ---- Ä ---- ---- ---- */ + 0x2D, 0x2F, 0x07, 0x8E, 0x07, 0x07, 0x07, 0x8F, + /* 0x68 ---- , % _ > ? */ + 0x80, 0xA5, 0x07, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + /* 0x70 ---- ---- ---- ---- ---- ---- ---- */ + 0x07, 0x90, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + /* 0x78 * ` : # @ ' = " */ + 0x70, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + /* 0x80 * a b c d e f g */ + 0x07, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + /* 0x88 h i ---- ---- ---- */ + 0x68, 0x69, 0xAE, 0xAF, 0x07, 0x07, 0x07, 0xF1, + /* 0x90 ° j k l m n o p */ + 0xF8, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + /* 0x98 q r ---- ---- */ + 0x71, 0x72, 0xA6, 0xA7, 0x91, 0x07, 0x92, 0x07, + /* 0xA0 ~ s t u v w x */ + 0xE6, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + /* 0xA8 y z ---- ---- ---- ---- */ + 0x79, 0x7A, 0xAD, 0xAB, 0x07, 0x07, 0x07, 0x07, + /* 0xB0 ---- § ---- */ + 0x9B, 0x9C, 0x9D, 0xFA, 0x07, 0x07, 0x07, 0xAC, + /* 0xB8 ---- | ---- ---- ---- ---- */ + 0xAB, 0x07, 0xAA, 0x7C, 0x07, 0x07, 0x07, 0x07, + /* 0xC0 { A B C D E F G */ + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + /* 0xC8 H I ---- ö ---- */ + 0x48, 0x49, 0x07, 0x93, 0x94, 0x95, 0xA2, 0x07, + /* 0xD0 } J K L M N O P */ + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, + /* 0xD8 Q R ---- ü */ + 0x51, 0x52, 0x07, 0x96, 0x81, 0x97, 0xA3, 0x98, + /* 0xE0 \ S T U V W X */ + 0x5C, 0xF6, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + /* 0xE8 Y Z ---- Ö ---- ---- ---- */ + 0x59, 0x5A, 0xFD, 0x07, 0x99, 0x07, 0x07, 0x07, + /* 0xF0 0 1 2 3 4 5 6 7 */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + /* 0xF8 8 9 ---- ---- Ü ---- ---- ---- */ + 0x38, 0x39, 0x07, 0x07, 0x9A, 0x07, 0x07, 0x07 +}; + + +/* + * EBCDIC 037/500 conversion table: + * from upper to lower case + */ +__u8 _ebc_tolower[256] = +{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9C, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xAA, 0xAB, 0x8C, 0x8D, 0x8E, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, + 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xEA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xDB, 0xDC, 0xDD, 0xDE, 0xFF +}; + + +/* + * EBCDIC 037/500 conversion table: + * from lower to upper case + */ +__u8 _ebc_toupper[256] = +{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, + 0x40, 0x41, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0x8A, 0x8B, 0xAC, 0xAD, 0xAE, 0x8F, + 0x90, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0x9A, 0x9B, 0x9E, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, + 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF +}; + +EXPORT_SYMBOL(_ascebc_500); +EXPORT_SYMBOL(_ebcasc_500); +EXPORT_SYMBOL(_ascebc); +EXPORT_SYMBOL(_ebcasc); +EXPORT_SYMBOL(_ebc_tolower); +EXPORT_SYMBOL(_ebc_toupper); + diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S new file mode 100644 index 0000000000..49a11f6dd7 --- /dev/null +++ b/arch/s390/kernel/entry.S @@ -0,0 +1,669 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * S390 low-level entry points. + * + * Copyright IBM Corp. 1999, 2012 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Hartmut Penner (hp@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +_LPP_OFFSET = __LC_LPP + + .macro STBEAR address + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + .endm + + .macro LBEAR address + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 + .endm + + .macro LPSWEY address,lpswe + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 + .endm + + .macro MBEAR reg + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + .endm + + .macro CHECK_STACK savearea +#ifdef CONFIG_CHECK_STACK + tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD + lghi %r14,\savearea + jz stack_overflow +#endif + .endm + + .macro CHECK_VMAP_STACK savearea,oklabel +#ifdef CONFIG_VMAP_STACK + lgr %r14,%r15 + nill %r14,0x10000 - THREAD_SIZE + oill %r14,STACK_INIT_OFFSET + clg %r14,__LC_KERNEL_STACK + je \oklabel + clg %r14,__LC_ASYNC_STACK + je \oklabel + clg %r14,__LC_MCCK_STACK + je \oklabel + clg %r14,__LC_NODAT_STACK + je \oklabel + clg %r14,__LC_RESTART_STACK + je \oklabel + lghi %r14,\savearea + j stack_overflow +#else + j \oklabel +#endif + .endm + + /* + * The TSTMSK macro generates a test-under-mask instruction by + * calculating the memory offset for the specified mask value. + * Mask value can be any constant. The macro shifts the mask + * value to calculate the memory offset for the test-under-mask + * instruction. + */ + .macro TSTMSK addr, mask, size=8, bytepos=0 + .if (\bytepos < \size) && (\mask >> 8) + .if (\mask & 0xff) + .error "Mask exceeds byte boundary" + .endif + TSTMSK \addr, "(\mask >> 8)", \size, "(\bytepos + 1)" + .exitm + .endif + .ifeq \mask + .error "Mask must not be zero" + .endif + off = \size - \bytepos - 1 + tm off+\addr, \mask + .endm + + .macro BPOFF + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 + .endm + + .macro BPON + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 + .endm + + .macro BPENTER tif_ptr,tif_mask + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ + "j .+12; nop; nop", 82 + .endm + + .macro BPEXIT tif_ptr,tif_mask + TSTMSK \tif_ptr,\tif_mask + ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 + .endm + +#if IS_ENABLED(CONFIG_KVM) + /* + * The OUTSIDE macro jumps to the provided label in case the value + * in the provided register is outside of the provided range. The + * macro is useful for checking whether a PSW stored in a register + * pair points inside or outside of a block of instructions. + * @reg: register to check + * @start: start of the range + * @end: end of the range + * @outside_label: jump here if @reg is outside of [@start..@end) + */ + .macro OUTSIDE reg,start,end,outside_label + lgr %r14,\reg + larl %r13,\start + slgr %r14,%r13 + clgfrl %r14,.Lrange_size\@ + jhe \outside_label + .section .rodata, "a" + .balign 4 +.Lrange_size\@: + .long \end - \start + .previous + .endm + + .macro SIEEXIT + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + .endm +#endif + + .macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + brasl %r14,stackleak_erase_on_task_stack +#endif + .endm + + GEN_BR_THUNK %r14 + + .section .kprobes.text, "ax" +.Ldummy: + /* + * The following nop exists only in order to avoid that the next + * symbol starts at the beginning of the kprobes text section. + * In that case there would be several symbols at the same address. + * E.g. objdump would take an arbitrary symbol when disassembling + * the code. + * With the added nop in between this cannot happen. + */ + nop 0 + +/* + * Scheduler resume function, called by switch_to + * gpr2 = (task_struct *) prev + * gpr3 = (task_struct *) next + * Returns: + * gpr2 = prev + */ +SYM_FUNC_START(__switch_to) + stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task + lghi %r4,__TASK_stack + lghi %r1,__TASK_thread + llill %r5,STACK_INIT_OFFSET + stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev + lg %r15,0(%r4,%r3) # start of kernel stack of next + agr %r15,%r5 # end of kernel stack of next + stg %r3,__LC_CURRENT # store task struct of next + stg %r15,__LC_KERNEL_STACK # store end of kernel stack + lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next + aghi %r3,__TASK_pid + mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 + BR_EX %r14 +SYM_FUNC_END(__switch_to) + +#if IS_ENABLED(CONFIG_KVM) +/* + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area + */ +SYM_FUNC_START(__sie64a) + stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers + lg %r12,__LC_CURRENT + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area + xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 + lg %r14,__LC_GMAP # get gmap pointer + ltgr %r14,%r14 + jz .Lsie_gmap + lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce +.Lsie_gmap: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer + oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now + tm __SIE_PROG20+3(%r14),3 # last exit... + jnz .Lsie_skip + TSTMSK __LC_CPU_FLAGS,_CIF_FPU + jo .Lsie_skip # exit if fp/vx regs changed + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr + BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST +.Lsie_entry: + sie 0(%r14) +# Let the next instruction be NOP to avoid triggering a machine check +# and handling it in a guest as result of the instruction execution. + nopr 7 +.Lsie_leave: + BPOFF + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST +.Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce +.Lsie_done: +# some program checks are suppressing. C code (e.g. do_protection_exception) +# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There +# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. +# Other instructions between __sie64a and .Lsie_done should not cause program +# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. +.Lrewind_pad6: + nopr 7 +.Lrewind_pad4: + nopr 7 +.Lrewind_pad2: + nopr 7 +SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) + lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area + stmg %r0,%r13,0(%r14) # save guest gprs 0-13 + xgr %r0,%r0 # clear guest registers to + xgr %r1,%r1 # prevent speculative use + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers + lg %r2,__SF_SIE_REASON(%r15) # return exit reason code + BR_EX %r14 +.Lsie_fault: + lghi %r14,-EFAULT + stg %r14,__SF_SIE_REASON(%r15) # set exit reason code + j sie_exit + + EX_TABLE(.Lrewind_pad6,.Lsie_fault) + EX_TABLE(.Lrewind_pad4,.Lsie_fault) + EX_TABLE(.Lrewind_pad2,.Lsie_fault) + EX_TABLE(sie_exit,.Lsie_fault) +SYM_FUNC_END(__sie64a) +EXPORT_SYMBOL(__sie64a) +EXPORT_SYMBOL(sie_exit) +#endif + +/* + * SVC interrupt handler routine. System calls are synchronous events and + * are entered with interrupts disabled. + */ + +SYM_CODE_START(system_call) + stpt __LC_SYS_ENTER_TIMER + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + BPOFF + lghi %r14,0 +.Lsysc_per: + STBEAR __LC_LAST_BREAK + lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + # clear user controlled register to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r8,%r8 + xgr %r9,%r9 + xgr %r10,%r10 + xgr %r11,%r11 + la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs + mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + MBEAR %r2 + lgr %r3,%r14 + brasl %r14,__do_syscall + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + stpt __LC_EXIT_TIMER + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(system_call) + +# +# a new process exits the kernel with ret_from_fork +# +SYM_CODE_START(ret_from_fork) + lgr %r3,%r11 + brasl %r14,__ret_from_fork + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + stpt __LC_EXIT_TIMER + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(ret_from_fork) + +/* + * Program check handler routine + */ + +SYM_CODE_START(pgm_check_handler) + stpt __LC_SYS_ENTER_TIMER + BPOFF + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + lghi %r10,0 + lmg %r8,%r9,__LC_PGM_OLD_PSW + tmhh %r8,0x0001 # coming from user space? + jno .Lpgm_skip_asce + lctlg %c1,%c1,__LC_KERNEL_ASCE + j 3f # -> fault in user space +.Lpgm_skip_asce: +#if IS_ENABLED(CONFIG_KVM) + # cleanup critical section for program checks in __sie64a + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT + lghi %r10,_PIF_GUEST_FAULT +#endif +1: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 2f # -> enabled, can't be a double fault + tm __LC_PGM_ILC+3,0x80 # check for per exception + jnz .Lpgm_svcper # -> single stepped svc +2: CHECK_STACK __LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + # CHECK_VMAP_STACK branches to stack_overflow or 4f + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f +3: lg %r15,__LC_KERNEL_STACK +4: la %r11,STACK_FRAME_OVERHEAD(%r15) + stg %r10,__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK + stmg %r8,%r9,__PT_PSW(%r11) + + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + lgr %r2,%r11 + brasl %r14,__do_pgm_check + tmhh %r8,0x0001 # returning to user space? + jno .Lpgm_exit_kernel + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON + stpt __LC_EXIT_TIMER +.Lpgm_exit_kernel: + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE + +# +# single stepped system call +# +.Lpgm_svcper: + mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + larl %r14,.Lsysc_per + stg %r14,__LC_RETURN_PSW+8 + lghi %r14,1 + LBEAR __LC_PGM_LAST_BREAK + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per +SYM_CODE_END(pgm_check_handler) + +/* + * Interrupt handler macro used for external and IO interrupts. + */ +.macro INT_HANDLER name,lc_old_psw,handler +SYM_CODE_START(\name) + stckf __LC_INT_CLOCK + stpt __LC_SYS_ENTER_TIMER + STBEAR __LC_LAST_BREAK + BPOFF + stmg %r8,%r15,__LC_SAVE_AREA_ASYNC + lmg %r8,%r9,\lc_old_psw + tmhh %r8,0x0001 # interrupting from user ? + jnz 1f +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +0: CHECK_STACK __LC_SAVE_AREA_ASYNC + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + j 2f +1: lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK +2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + la %r11,STACK_FRAME_OVERHEAD(%r15) + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r10,%r10 + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + MBEAR %r11 + stmg %r8,%r9,__PT_PSW(%r11) + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,\handler + mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + tmhh %r8,0x0001 # returning to user ? + jno 2f + STACKLEAK_ERASE + lctlg %c1,%c1,__LC_USER_ASCE + BPON + stpt __LC_EXIT_TIMER +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(\name) +.endm + +INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq +INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq + +/* + * Load idle PSW. + */ +SYM_FUNC_START(psw_idle) + stg %r14,(__SF_GPRS+8*8)(%r15) + stg %r3,__SF_EMPTY(%r15) + larl %r1,psw_idle_exit + stg %r1,__SF_EMPTY+8(%r15) + larl %r1,smp_cpu_mtid + llgf %r1,0(%r1) + ltgr %r1,%r1 + jz .Lpsw_idle_stcctm + .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) +.Lpsw_idle_stcctm: + oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT + BPON + stckf __CLOCK_IDLE_ENTER(%r2) + stpt __TIMER_IDLE_ENTER(%r2) + lpswe __SF_EMPTY(%r15) +SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL) + BR_EX %r14 +SYM_FUNC_END(psw_idle) + +/* + * Machine check handler routines + */ +SYM_CODE_START(mcck_int_handler) + BPOFF + la %r1,4095 # validate r1 + spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer + LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear + lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs + lmg %r8,%r9,__LC_MCK_OLD_PSW + TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + jo .Lmcck_panic # yes -> rest of mcck code invalid + TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + jno .Lmcck_panic # control registers invalid -> panic + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs + ptlb + lghi %r14,__LC_CPU_TIMER_SAVE_AREA + mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) + TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + jo 3f + la %r14,__LC_SYS_ENTER_TIMER + clc 0(8,%r14),__LC_EXIT_TIMER + jl 1f + la %r14,__LC_EXIT_TIMER +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + jl 2f + la %r14,__LC_LAST_UPDATE_TIMER +2: spt 0(%r14) + mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) +3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + jno .Lmcck_panic + tmhh %r8,0x0001 # interrupting from user ? + jnz .Lmcck_user + TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + jno .Lmcck_panic +#if IS_ENABLED(CONFIG_KVM) + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_user + OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT +#endif +.Lmcck_user: + lg %r15,__LC_MCCK_STACK + la %r11,STACK_FRAME_OVERHEAD(%r15) + stctg %c1,%c1,__PT_CR1(%r11) + lctlg %c1,%c1,__LC_KERNEL_ASCE + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lghi %r14,__LC_GPREGS_SAVE_AREA+64 + stmg %r0,%r7,__PT_R0(%r11) + # clear user controlled registers to prevent speculative use + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r3,%r3 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r10,%r10 + mvc __PT_R8(64,%r11),0(%r14) + stmg %r8,%r9,__PT_PSW(%r11) + xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,s390_do_machine_check + lctlg %c1,%c1,__PT_CR1(%r11) + lmg %r0,%r10,__PT_R0(%r11) + mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + jno 0f + BPON + stpt __LC_EXIT_TIMER +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE + +.Lmcck_panic: + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. + */ + lhi %r5,0 + lhi %r6,1 + larl %r7,stop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,this_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b +SYM_CODE_END(mcck_int_handler) + +SYM_CODE_START(restart_int_handler) + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 + stg %r15,__LC_SAVE_AREA_RESTART + TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 + jz 0f + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA +0: larl %r15,daton_psw + lpswe 0(%r15) # turn dat on, keep irqs off +.Ldaton: + lg %r15,__LC_RESTART_STACK + xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) + stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART + mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW + xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) + lg %r1,__LC_RESTART_FN # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA + lgf %r3,__LC_RESTART_SOURCE + ltgr %r3,%r3 # test source cpu address + jm 1f # negative -> skip source stop +0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu + brc 10,0b # wait for status stored +1: basr %r14,%r1 # call function + stap __SF_EMPTY(%r15) # store cpu address + llgh %r3,__SF_EMPTY(%r15) +2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu + brc 2,2b +3: j 3b +SYM_CODE_END(restart_int_handler) + + .section .kprobes.text, "ax" + +#if defined(CONFIG_CHECK_STACK) || defined(CONFIG_VMAP_STACK) +/* + * The synchronous or the asynchronous stack overflowed. We are dead. + * No need to properly save the registers, we are going to panic anyway. + * Setup a pt_regs so that show_trace can provide a good call trace. + */ +SYM_CODE_START(stack_overflow) + lg %r15,__LC_NODAT_STACK # change to panic stack + la %r11,STACK_FRAME_OVERHEAD(%r15) + stmg %r0,%r7,__PT_R0(%r11) + stmg %r8,%r9,__PT_PSW(%r11) + mvc __PT_R8(64,%r11),0(%r14) + stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + lgr %r2,%r11 # pass pointer to pt_regs + jg kernel_stack_overflow +SYM_CODE_END(stack_overflow) +#endif + + .section .data, "aw" + .balign 4 +SYM_DATA_LOCAL(stop_lock, .long 0) +SYM_DATA_LOCAL(this_cpu, .short 0) + .balign 8 +SYM_DATA_START_LOCAL(daton_psw) + .quad PSW_KERNEL_BITS + .quad .Ldaton +SYM_DATA_END(daton_psw) + + .section .rodata, "a" +#define SYSCALL(esame,emu) .quad __s390x_ ## esame +SYM_DATA_START(sys_call_table) +#include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table) +#undef SYSCALL + +#ifdef CONFIG_COMPAT + +#define SYSCALL(esame,emu) .quad __s390_ ## emu +SYM_DATA_START(sys_call_table_emu) +#include "asm/syscall_table.h" +SYM_DATA_END(sys_call_table_emu) +#undef SYSCALL +#endif diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h new file mode 100644 index 0000000000..9f41853f36 --- /dev/null +++ b/arch/s390/kernel/entry.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ENTRY_H +#define _ENTRY_H + +#include +#include +#include +#include +#include +#include + +extern void *restart_stack; + +void system_call(void); +void pgm_check_handler(void); +void ext_int_handler(void); +void io_int_handler(void); +void mcck_int_handler(void); +void restart_int_handler(void); +void early_pgm_check_handler(void); + +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); +void __do_pgm_check(struct pt_regs *regs); +void __do_syscall(struct pt_regs *regs, int per_trap); +void __do_early_pgm_check(struct pt_regs *regs); + +void do_protection_exception(struct pt_regs *regs); +void do_dat_exception(struct pt_regs *regs); +void do_secure_storage_access(struct pt_regs *regs); +void do_non_secure_storage_access(struct pt_regs *regs); +void do_secure_storage_violation(struct pt_regs *regs); +void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); +void kernel_stack_overflow(struct pt_regs * regs); +void handle_signal32(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs); + +void do_io_irq(struct pt_regs *regs); +void do_ext_irq(struct pt_regs *regs); +void do_restart(void *arg); +void __init startup_init(void); +void die(struct pt_regs *regs, const char *str); +int setup_profiling_timer(unsigned int multiplier); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); + +struct s390_mmap_arg_struct; +struct fadvise64_64_args; +struct old_sigaction; + +long sys_rt_sigreturn(void); +long sys_sigreturn(void); + +long sys_s390_personality(unsigned int personality); +long sys_s390_runtime_instr(int command, int signum); +long sys_s390_guarded_storage(int command, struct gs_cb __user *); +long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); +long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); +long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); + +DECLARE_PER_CPU(u64, mt_cycles[8]); + +unsigned long stack_alloc(void); +void stack_free(unsigned long stack); + +extern char kprobes_insn_page[]; + +extern char _samode31[], _eamode31[]; +extern char _stext_amode31[], _etext_amode31[]; +extern struct exception_table_entry _start_amode31_ex_table[]; +extern struct exception_table_entry _stop_amode31_ex_table[]; + +#define __amode31_data __section(".amode31.data") +#define __amode31_ref __section(".amode31.refs") +extern long _start_amode31_refs[], _end_amode31_refs[]; + +#endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c new file mode 100644 index 0000000000..4666b29ac8 --- /dev/null +++ b/arch/s390/kernel/fpu.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * In-kernel vector facility support functions + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner + */ +#include +#include +#include +#include +#include +#include + +void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the save to the FPU/vector registers already + * in use by the previous context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Save floating point control */ + asm volatile("stfpc %0" : "=Q" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Save floating-point registers */ + asm volatile("std 0,%0" : "=Q" (state->fprs[0])); + asm volatile("std 1,%0" : "=Q" (state->fprs[1])); + asm volatile("std 2,%0" : "=Q" (state->fprs[2])); + asm volatile("std 3,%0" : "=Q" (state->fprs[3])); + asm volatile("std 4,%0" : "=Q" (state->fprs[4])); + asm volatile("std 5,%0" : "=Q" (state->fprs[5])); + asm volatile("std 6,%0" : "=Q" (state->fprs[6])); + asm volatile("std 7,%0" : "=Q" (state->fprs[7])); + asm volatile("std 8,%0" : "=Q" (state->fprs[8])); + asm volatile("std 9,%0" : "=Q" (state->fprs[9])); + asm volatile("std 10,%0" : "=Q" (state->fprs[10])); + asm volatile("std 11,%0" : "=Q" (state->fprs[11])); + asm volatile("std 12,%0" : "=Q" (state->fprs[12])); + asm volatile("std 13,%0" : "=Q" (state->fprs[13])); + asm volatile("std 14,%0" : "=Q" (state->fprs[14])); + asm volatile("std 15,%0" : "=Q" (state->fprs[15])); + } + return; + } + + /* Test and save vector registers */ + asm volatile ( + /* + * Test if any vector register must be saved and, if so, + * test if all register can be saved. + */ + " la 1,%[vxrs]\n" /* load save area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> save V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vstm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> save V8..V23 */ + " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and save the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> save V0..V15 */ + " brc 2,1f\n" /* 10 -> save V8..V15 */ + " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + /* Test and save the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> save V16..V31 */ + " brc 2,4f\n" /* 10 -> save V24..V31 */ + " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the restore to the FPU/vector registers of the + * previous context that have been overwritte by the + * current context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Restore floating-point controls */ + asm volatile("lfpc %0" : : "Q" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Restore floating-point registers */ + asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); + asm volatile("ld 1,%0" : : "Q" (state->fprs[1])); + asm volatile("ld 2,%0" : : "Q" (state->fprs[2])); + asm volatile("ld 3,%0" : : "Q" (state->fprs[3])); + asm volatile("ld 4,%0" : : "Q" (state->fprs[4])); + asm volatile("ld 5,%0" : : "Q" (state->fprs[5])); + asm volatile("ld 6,%0" : : "Q" (state->fprs[6])); + asm volatile("ld 7,%0" : : "Q" (state->fprs[7])); + asm volatile("ld 8,%0" : : "Q" (state->fprs[8])); + asm volatile("ld 9,%0" : : "Q" (state->fprs[9])); + asm volatile("ld 10,%0" : : "Q" (state->fprs[10])); + asm volatile("ld 11,%0" : : "Q" (state->fprs[11])); + asm volatile("ld 12,%0" : : "Q" (state->fprs[12])); + asm volatile("ld 13,%0" : : "Q" (state->fprs[13])); + asm volatile("ld 14,%0" : : "Q" (state->fprs[14])); + asm volatile("ld 15,%0" : : "Q" (state->fprs[15])); + } + return; + } + + /* Test and restore (load) vector registers */ + asm volatile ( + /* + * Test if any vector register must be loaded and, if so, + * test if all registers can be loaded at once. + */ + " la 1,%[vxrs]\n" /* load restore area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> restore V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vlm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> restore V8..V23 */ + " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and restore the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> restore V0..V15 */ + " brc 2,1f\n" /* 10 -> restore V8..V15 */ + " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + /* Test and restore the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> restore V16..V31 */ + " brc 2,4f\n" /* 10 -> restore V24..V31 */ + " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_end); + +void __load_fpu_regs(void) +{ + struct fpu *state = ¤t->thread.fpu; + unsigned long *regs = current->thread.fpu.regs; + + asm volatile("lfpc %0" : : "Q" (state->fpc)); + if (likely(MACHINE_HAS_VX)) { + asm volatile("lgr 1,%0\n" + "VLM 0,15,0,1\n" + "VLM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("ld 0,%0" : : "Q" (regs[0])); + asm volatile("ld 1,%0" : : "Q" (regs[1])); + asm volatile("ld 2,%0" : : "Q" (regs[2])); + asm volatile("ld 3,%0" : : "Q" (regs[3])); + asm volatile("ld 4,%0" : : "Q" (regs[4])); + asm volatile("ld 5,%0" : : "Q" (regs[5])); + asm volatile("ld 6,%0" : : "Q" (regs[6])); + asm volatile("ld 7,%0" : : "Q" (regs[7])); + asm volatile("ld 8,%0" : : "Q" (regs[8])); + asm volatile("ld 9,%0" : : "Q" (regs[9])); + asm volatile("ld 10,%0" : : "Q" (regs[10])); + asm volatile("ld 11,%0" : : "Q" (regs[11])); + asm volatile("ld 12,%0" : : "Q" (regs[12])); + asm volatile("ld 13,%0" : : "Q" (regs[13])); + asm volatile("ld 14,%0" : : "Q" (regs[14])); + asm volatile("ld 15,%0" : : "Q" (regs[15])); + } + clear_cpu_flag(CIF_FPU); +} +EXPORT_SYMBOL(__load_fpu_regs); + +void load_fpu_regs(void) +{ + raw_local_irq_disable(); + __load_fpu_regs(); + raw_local_irq_enable(); +} +EXPORT_SYMBOL(load_fpu_regs); + +void save_fpu_regs(void) +{ + unsigned long flags, *regs; + struct fpu *state; + + local_irq_save(flags); + + if (test_cpu_flag(CIF_FPU)) + goto out; + + state = ¤t->thread.fpu; + regs = current->thread.fpu.regs; + + asm volatile("stfpc %0" : "=Q" (state->fpc)); + if (likely(MACHINE_HAS_VX)) { + asm volatile("lgr 1,%0\n" + "VSTM 0,15,0,1\n" + "VSTM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("std 0,%0" : "=Q" (regs[0])); + asm volatile("std 1,%0" : "=Q" (regs[1])); + asm volatile("std 2,%0" : "=Q" (regs[2])); + asm volatile("std 3,%0" : "=Q" (regs[3])); + asm volatile("std 4,%0" : "=Q" (regs[4])); + asm volatile("std 5,%0" : "=Q" (regs[5])); + asm volatile("std 6,%0" : "=Q" (regs[6])); + asm volatile("std 7,%0" : "=Q" (regs[7])); + asm volatile("std 8,%0" : "=Q" (regs[8])); + asm volatile("std 9,%0" : "=Q" (regs[9])); + asm volatile("std 10,%0" : "=Q" (regs[10])); + asm volatile("std 11,%0" : "=Q" (regs[11])); + asm volatile("std 12,%0" : "=Q" (regs[12])); + asm volatile("std 13,%0" : "=Q" (regs[13])); + asm volatile("std 14,%0" : "=Q" (regs[14])); + asm volatile("std 15,%0" : "=Q" (regs[15])); + } + set_cpu_flag(CIF_FPU); +out: + local_irq_restore(flags); +} +EXPORT_SYMBOL(save_fpu_regs); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c new file mode 100644 index 0000000000..c46381ea04 --- /dev/null +++ b/arch/s390/kernel/ftrace.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dynamic function tracer architecture backend. + * + * Copyright IBM Corp. 2009,2014 + * + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" +#include "ftrace.h" + +/* + * To generate function prologue either gcc's hotpatch feature (since gcc 4.8) + * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags + * (since gcc 9 / clang 10) is used. + * In both cases the original and also the disabled function prologue contains + * only a single six byte instruction and looks like this: + * > brcl 0,0 # offset 0 + * To enable ftrace the code gets patched like above and afterwards looks + * like this: + * > brasl %r0,ftrace_caller # offset 0 + * + * The instruction will be patched by ftrace_make_call / ftrace_make_nop. + * The ftrace function gets called with a non-standard C function call ABI + * where r0 contains the return address. It is also expected that the called + * function only clobbers r0 and r1, but restores r2-r15. + * For module code we can't directly jump to ftrace caller, but need a + * trampoline (ftrace_plt), which clobbers also r1. + */ + +void *ftrace_func __read_mostly = ftrace_stub; +struct ftrace_insn { + u16 opc; + s32 disp; +} __packed; + +#ifdef CONFIG_MODULES +static char *ftrace_plt; +#endif /* CONFIG_MODULES */ + +static const char *ftrace_shared_hotpatch_trampoline(const char **end) +{ + const char *tstart, *tend; + + tstart = ftrace_shared_hotpatch_trampoline_br; + tend = ftrace_shared_hotpatch_trampoline_br_end; +#ifdef CONFIG_EXPOLINE + if (!nospec_disable) { + tstart = ftrace_shared_hotpatch_trampoline_exrl; + tend = ftrace_shared_hotpatch_trampoline_exrl_end; + } +#endif /* CONFIG_EXPOLINE */ + if (end) + *end = tend; + return tstart; +} + +bool ftrace_need_init_nop(void) +{ + return true; +} + +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) +{ + static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = + __ftrace_hotpatch_trampolines_start; + static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_hotpatch_trampoline **next_trampoline; + struct ftrace_hotpatch_trampoline *trampolines_end; + struct ftrace_hotpatch_trampoline tmp; + struct ftrace_insn *insn; + const char *shared; + s32 disp; + + BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) != + SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE); + + next_trampoline = &next_vmlinux_trampoline; + trampolines_end = __ftrace_hotpatch_trampolines_end; + shared = ftrace_shared_hotpatch_trampoline(NULL); +#ifdef CONFIG_MODULES + if (mod) { + next_trampoline = &mod->arch.next_trampoline; + trampolines_end = mod->arch.trampolines_end; + shared = ftrace_plt; + } +#endif + + if (WARN_ON_ONCE(*next_trampoline >= trampolines_end)) + return -ENOMEM; + trampoline = (*next_trampoline)++; + + /* Check for the compiler-generated fentry nop (brcl 0, .). */ + if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + return -EINVAL; + + /* Generate the trampoline. */ + tmp.brasl_opc = 0xc015; /* brasl %r1, shared */ + tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2; + tmp.interceptor = FTRACE_ADDR; + tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn); + s390_kernel_write(trampoline, &tmp, sizeof(tmp)); + + /* Generate a jump to the trampoline. */ + disp = ((char *)trampoline - (char *)rec->ip) / 2; + insn = (struct ftrace_insn *)rec->ip; + s390_kernel_write(&insn->disp, &disp, sizeof(disp)); + + return 0; +} + +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) +{ + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; +} + +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + return 0; +} + +static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) +{ + u16 old; + u8 op; + + if (get_kernel_nofault(old, addr)) + return -EFAULT; + if (old != expected) + return -EINVAL; + /* set mask field to all ones or zeroes */ + op = enable ? 0xf4 : 0x04; + s390_kernel_write((char *)addr + 1, &op, sizeof(op)); + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + /* Expect brcl 0xf,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + struct ftrace_hotpatch_trampoline *trampoline; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + /* Expect brcl 0x0,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + ftrace_func = func; + return 0; +} + +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + +void ftrace_arch_code_modify_post_process(void) +{ + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. + */ + text_poke_sync_lock(); +} + +#ifdef CONFIG_MODULES + +static int __init ftrace_plt_init(void) +{ + const char *start, *end; + + ftrace_plt = module_alloc(PAGE_SIZE); + if (!ftrace_plt) + panic("cannot allocate ftrace plt\n"); + + start = ftrace_shared_hotpatch_trampoline(&end); + memcpy(ftrace_plt, start, end - start); + set_memory_rox((unsigned long)ftrace_plt, 1); + return 0; +} +device_initcall(ftrace_plt_init); + +#endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * Hook the return address and push it in the stack of return addresses + * in current thread info. + */ +unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, + unsigned long ip) +{ + if (unlikely(ftrace_graph_is_dead())) + goto out; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + goto out; + ip -= MCOUNT_INSN_SIZE; + if (!function_graph_enter(ra, ip, 0, (void *) sp)) + ra = (unsigned long) return_to_handler; +out: + return ra; +} +NOKPROBE_SYMBOL(prepare_ftrace_return); + +/* + * Patch the kernel code at ftrace_graph_caller location. The instruction + * there is branch relative on condition. To enable the ftrace graph code + * block, we simply patch the mask field of the instruction to zero and + * turn the instruction into a nop. + * To disable the ftrace graph code the mask field will be patched to + * all ones, which turns the instruction into an unconditional branch. + */ +int ftrace_enable_ftrace_graph_caller(void) +{ + int rc; + + /* Expect brc 0xf,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); + if (rc) + return rc; + text_poke_sync_lock(); + return 0; +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + int rc; + + /* Expect brc 0x0,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); + if (rc) + return rc; + text_poke_sync_lock(); + return 0; +} + +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_KPROBES_ON_FTRACE +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct kprobe_ctlblk *kcb; + struct pt_regs *regs; + struct kprobe *p; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + regs = ftrace_get_regs(fregs); + p = get_kprobe((kprobe_opcode_t *)ip); + if (!regs || unlikely(!p) || kprobe_disabled(p)) + goto out; + + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + goto out; + } + + __this_cpu_write(current_kprobe, p); + + kcb = get_kprobe_ctlblk(); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + instruction_pointer_set(regs, ip); + + if (!p->pre_handler || !p->pre_handler(p, regs)) { + + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); + + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + } + __this_cpu_write(current_kprobe, NULL); +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + return 0; +} +#endif diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h new file mode 100644 index 0000000000..7f75a96164 --- /dev/null +++ b/arch/s390/kernel/ftrace.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FTRACE_H +#define _FTRACE_H + +#include + +struct ftrace_hotpatch_trampoline { + u16 brasl_opc; + s32 brasl_disp; + s16: 16; + u64 rest_of_intercepted_function; + u64 interceptor; +} __packed; + +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[]; +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[]; +extern const char ftrace_shared_hotpatch_trampoline_br[]; +extern const char ftrace_shared_hotpatch_trampoline_br_end[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; +extern const char ftrace_plt_template[]; +extern const char ftrace_plt_template_end[]; + +#endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c new file mode 100644 index 0000000000..d14dd1c2e5 --- /dev/null +++ b/arch/s390/kernel/guarded_storage.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include "entry.h" + +void guarded_storage_release(struct task_struct *tsk) +{ + kfree(tsk->thread.gs_cb); + kfree(tsk->thread.gs_bc_cb); +} + +static int gs_enable(void) +{ + struct gs_cb *gs_cb; + + if (!current->thread.gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + gs_cb->gsd = 25; + preempt_disable(); + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + preempt_enable(); + } + return 0; +} + +static int gs_disable(void) +{ + if (current->thread.gs_cb) { + preempt_disable(); + kfree(current->thread.gs_cb); + current->thread.gs_cb = NULL; + __ctl_clear_bit(2, 4); + preempt_enable(); + } + return 0; +} + +static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + if (!gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + current->thread.gs_bc_cb = gs_cb; + } + if (copy_from_user(gs_cb, u_gs_cb, sizeof(*gs_cb))) + return -EFAULT; + return 0; +} + +static int gs_clear_bc_cb(void) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + current->thread.gs_bc_cb = NULL; + kfree(gs_cb); + return 0; +} + +void gs_load_bc_cb(struct pt_regs *regs) +{ + struct gs_cb *gs_cb; + + preempt_disable(); + clear_thread_flag(TIF_GUARDED_STORAGE); + gs_cb = current->thread.gs_bc_cb; + if (gs_cb) { + kfree(current->thread.gs_cb); + current->thread.gs_bc_cb = NULL; + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + } + preempt_enable(); +} + +static int gs_broadcast(void) +{ + struct task_struct *sibling; + + read_lock(&tasklist_lock); + for_each_thread(current, sibling) { + if (!sibling->thread.gs_bc_cb) + continue; + if (test_and_set_tsk_thread_flag(sibling, TIF_GUARDED_STORAGE)) + kick_process(sibling); + } + read_unlock(&tasklist_lock); + return 0; +} + +SYSCALL_DEFINE2(s390_guarded_storage, int, command, + struct gs_cb __user *, gs_cb) +{ + if (!MACHINE_HAS_GS) + return -EOPNOTSUPP; + switch (command) { + case GS_ENABLE: + return gs_enable(); + case GS_DISABLE: + return gs_disable(); + case GS_SET_BC_CB: + return gs_set_bc_cb(gs_cb); + case GS_CLEAR_BC_CB: + return gs_clear_bc_cb(); + case GS_BROADCAST: + return gs_broadcast(); + default: + return -EINVAL; + } +} diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S new file mode 100644 index 0000000000..45413b04ef --- /dev/null +++ b/arch/s390/kernel/head64.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2010 + * + * Author(s): Hartmut Penner + * Martin Schwidefsky + * Rob van der Heij + * + */ + +#include +#include +#include +#include +#include +#include + +__HEAD +SYM_CODE_START(startup_continue) + larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK +# +# Setup stack +# + larl %r14,init_task + stg %r14,__LC_CURRENT + larl %r15,init_thread_union+STACK_INIT_OFFSET + stg %r15,__LC_KERNEL_STACK + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk + brasl %r14,startup_init # s390 specific early init + brasl %r14,start_kernel # common init code +# +# We returned from start_kernel ?!? PANIK +# + basr %r13,0 + lpswe dw_psw-.(%r13) # load disabled wait psw +SYM_CODE_END(startup_continue) + + .balign 16 +SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c new file mode 100644 index 0000000000..e7239aaf42 --- /dev/null +++ b/arch/s390/kernel/idle.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Idle functions for s390. + * + * Copyright IBM Corp. 2014 + * + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); + +void account_idle_time_irq(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long idle_time; + u64 cycles_new[8]; + int i; + + if (smp_cpu_mtid) { + stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); + for (i = 0; i < smp_cpu_mtid; i++) + this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } + + idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; + + S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; + S390_lowcore.last_update_clock = S390_lowcore.int_clock; + + S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; + + /* Account time spent with enabled wait psw loaded as idle time. */ + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + account_idle_time(cputime_to_nsecs(idle_time)); +} + +void noinstr arch_cpu_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long psw_mask; + + /* Wait for external, I/O or machine check interrupt. */ + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + clear_cpu_flag(CIF_NOHZ_DELAY); + + /* psw_idle() returns with interrupts disabled. */ + psw_idle(idle, psw_mask); +} + +static ssize_t show_idle_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); +} +DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); + +static ssize_t show_idle_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12); +} +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); + +void arch_cpu_idle_enter(void) +{ +} + +void arch_cpu_idle_exit(void) +{ +} + +void __noreturn arch_cpu_idle_dead(void) +{ + cpu_die(); +} diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c new file mode 100644 index 0000000000..f3c3e6e1c5 --- /dev/null +++ b/arch/s390/kernel/ima_arch.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +bool arch_ima_get_secureboot(void) +{ + return ipl_secure_flag; +} + +const char * const *arch_get_ima_policy(void) +{ + return NULL; +} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c new file mode 100644 index 0000000000..8d0b95c173 --- /dev/null +++ b/arch/s390/kernel/ipl.c @@ -0,0 +1,2522 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ipl/reipl/dump support for Linux on s390. + * + * Copyright IBM Corp. 2005, 2012 + * Author(s): Michael Holzheu + * Volker Sameske + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +#define IPL_PARM_BLOCK_VERSION 0 + +#define IPL_UNKNOWN_STR "unknown" +#define IPL_CCW_STR "ccw" +#define IPL_ECKD_STR "eckd" +#define IPL_ECKD_DUMP_STR "eckd_dump" +#define IPL_FCP_STR "fcp" +#define IPL_FCP_DUMP_STR "fcp_dump" +#define IPL_NVME_STR "nvme" +#define IPL_NVME_DUMP_STR "nvme_dump" +#define IPL_NSS_STR "nss" + +#define DUMP_CCW_STR "ccw" +#define DUMP_ECKD_STR "eckd" +#define DUMP_FCP_STR "fcp" +#define DUMP_NVME_STR "nvme" +#define DUMP_NONE_STR "none" + +/* + * Four shutdown trigger types are supported: + * - panic + * - halt + * - power off + * - reipl + * - restart + */ +#define ON_PANIC_STR "on_panic" +#define ON_HALT_STR "on_halt" +#define ON_POFF_STR "on_poff" +#define ON_REIPL_STR "on_reboot" +#define ON_RESTART_STR "on_restart" + +struct shutdown_action; +struct shutdown_trigger { + char *name; + struct shutdown_action *action; +}; + +/* + * The following shutdown action types are supported: + */ +#define SHUTDOWN_ACTION_IPL_STR "ipl" +#define SHUTDOWN_ACTION_REIPL_STR "reipl" +#define SHUTDOWN_ACTION_DUMP_STR "dump" +#define SHUTDOWN_ACTION_VMCMD_STR "vmcmd" +#define SHUTDOWN_ACTION_STOP_STR "stop" +#define SHUTDOWN_ACTION_DUMP_REIPL_STR "dump_reipl" + +struct shutdown_action { + char *name; + void (*fn) (struct shutdown_trigger *trigger); + int (*init) (void); + int init_rc; +}; + +static char *ipl_type_str(enum ipl_type type) +{ + switch (type) { + case IPL_TYPE_CCW: + return IPL_CCW_STR; + case IPL_TYPE_ECKD: + return IPL_ECKD_STR; + case IPL_TYPE_ECKD_DUMP: + return IPL_ECKD_DUMP_STR; + case IPL_TYPE_FCP: + return IPL_FCP_STR; + case IPL_TYPE_FCP_DUMP: + return IPL_FCP_DUMP_STR; + case IPL_TYPE_NSS: + return IPL_NSS_STR; + case IPL_TYPE_NVME: + return IPL_NVME_STR; + case IPL_TYPE_NVME_DUMP: + return IPL_NVME_DUMP_STR; + case IPL_TYPE_UNKNOWN: + default: + return IPL_UNKNOWN_STR; + } +} + +enum dump_type { + DUMP_TYPE_NONE = 1, + DUMP_TYPE_CCW = 2, + DUMP_TYPE_FCP = 4, + DUMP_TYPE_NVME = 8, + DUMP_TYPE_ECKD = 16, +}; + +static char *dump_type_str(enum dump_type type) +{ + switch (type) { + case DUMP_TYPE_NONE: + return DUMP_NONE_STR; + case DUMP_TYPE_CCW: + return DUMP_CCW_STR; + case DUMP_TYPE_ECKD: + return DUMP_ECKD_STR; + case DUMP_TYPE_FCP: + return DUMP_FCP_STR; + case DUMP_TYPE_NVME: + return DUMP_NVME_STR; + default: + return NULL; + } +} + +int __bootdata_preserved(ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); + +static int reipl_capabilities = IPL_TYPE_UNKNOWN; + +static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; +static struct ipl_parameter_block *reipl_block_fcp; +static struct ipl_parameter_block *reipl_block_nvme; +static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_eckd; +static struct ipl_parameter_block *reipl_block_nss; +static struct ipl_parameter_block *reipl_block_actual; + +static int dump_capabilities = DUMP_TYPE_NONE; +static enum dump_type dump_type = DUMP_TYPE_NONE; +static struct ipl_parameter_block *dump_block_fcp; +static struct ipl_parameter_block *dump_block_nvme; +static struct ipl_parameter_block *dump_block_ccw; +static struct ipl_parameter_block *dump_block_eckd; + +static struct sclp_ipl_info sclp_ipl_info; + +static bool reipl_nvme_clear; +static bool reipl_fcp_clear; +static bool reipl_ccw_clear; +static bool reipl_eckd_clear; + +static unsigned long os_info_flags; + +static inline int __diag308(unsigned long subcode, unsigned long addr) +{ + union register_pair r1; + + r1.even = addr; + r1.odd = 0; + asm volatile( + " diag %[r1],%[subcode],0x308\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : [r1] "+&d" (r1.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + return r1.odd; +} + +int diag308(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X308); + return __diag308(subcode, addr ? virt_to_phys(addr) : 0); +} +EXPORT_SYMBOL_GPL(diag308); + +/* SYSFS */ + +#define IPL_ATTR_SHOW_FN(_prefix, _name, _format, args...) \ +static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + return scnprintf(page, PAGE_SIZE, _format, ##args); \ +} + +#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long ssid, devno; \ + \ + if (sscanf(buf, "0.%llx.%llx\n", &ssid, &devno) != 2) \ + return -EINVAL; \ + \ + if (ssid > __MAX_SSID || devno > __MAX_SUBCHANNEL) \ + return -EINVAL; \ + \ + _ipl_blk.ssid = ssid; \ + _ipl_blk.devno = devno; \ + return len; \ +} + +#define DEFINE_IPL_CCW_ATTR_RW(_prefix, _name, _ipl_blk) \ +IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ + _ipl_blk.ssid, _ipl_blk.devno); \ +IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) \ + +#define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ +IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL) + +#define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ +IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + unsigned long long value; \ + if (sscanf(buf, _fmt_in, &value) != 1) \ + return -EINVAL; \ + _value = value; \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + +#define DEFINE_IPL_ATTR_STR_RW(_prefix, _name, _fmt_out, _fmt_in, _value)\ +IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, _value) \ +static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + strscpy(_value, buf, sizeof(_value)); \ + strim(_value); \ + return len; \ +} \ +static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ + __ATTR(_name, 0644, \ + sys_##_prefix##_##_name##_show, \ + sys_##_prefix##_##_name##_store) + +/* + * ipl section + */ + +static __init enum ipl_type get_ipl_type(void) +{ + if (!ipl_block_valid) + return IPL_TYPE_UNKNOWN; + + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: + return IPL_TYPE_CCW; + case IPL_PBT_FCP: + if (ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) + return IPL_TYPE_FCP_DUMP; + else + return IPL_TYPE_FCP; + case IPL_PBT_NVME: + if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP) + return IPL_TYPE_NVME_DUMP; + else + return IPL_TYPE_NVME; + case IPL_PBT_ECKD: + if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return IPL_TYPE_ECKD_DUMP; + else + return IPL_TYPE_ECKD; + } + return IPL_TYPE_UNKNOWN; +} + +struct ipl_info ipl_info; +EXPORT_SYMBOL_GPL(ipl_info); + +static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sprintf(page, "%s\n", ipl_type_str(ipl_info.type)); +} + +static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); + +static ssize_t ipl_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%i\n", !!ipl_secure_flag); +} + +static struct kobj_attribute sys_ipl_secure_attr = + __ATTR(secure, 0444, ipl_secure_show, NULL); + +static ssize_t ipl_has_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%i\n", !!sclp.has_sipl); +} + +static struct kobj_attribute sys_ipl_has_secure_attr = + __ATTR(has_secure, 0444, ipl_has_secure_show, NULL); + +static ssize_t ipl_vm_parm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char parm[DIAG308_VMPARM_SIZE + 1] = {}; + + if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) + ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); + return sprintf(page, "%s\n", parm); +} + +static struct kobj_attribute sys_ipl_vm_parm_attr = + __ATTR(parm, 0444, ipl_vm_parm_show, NULL); + +static ssize_t sys_ipl_device_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + switch (ipl_info.type) { + case IPL_TYPE_CCW: + return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + return sprintf(page, "%08ux\n", ipl_block.nvme.fid); + default: + return 0; + } +} + +static struct kobj_attribute sys_ipl_device_attr = + __ATTR(device, 0444, sys_ipl_device_show, NULL); + +static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, &ipl_block, + ipl_block.hdr.len); +} +static struct bin_attribute ipl_parameter_attr = + __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL, + PAGE_SIZE); + +static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.fcp.scp_data_len; + void *scp_data = &ipl_block.fcp.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.nvme.scp_data_len; + void *scp_data = &ipl_block.nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.eckd.scp_data_len; + void *scp_data = &ipl_block.eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static struct bin_attribute ipl_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute ipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE); + +static struct bin_attribute *ipl_fcp_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_scp_data_attr, + NULL, +}; + +static struct bin_attribute *ipl_nvme_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_nvme_scp_data_attr, + NULL, +}; + +static struct bin_attribute *ipl_eckd_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_eckd_scp_data_attr, + NULL, +}; + +/* FCP ipl device attributes */ + +DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", + (unsigned long long)ipl_block.fcp.wwpn); +DEFINE_IPL_ATTR_RO(ipl_fcp, lun, "0x%016llx\n", + (unsigned long long)ipl_block.fcp.lun); +DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", + (unsigned long long)ipl_block.fcp.bootprog); +DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", + (unsigned long long)ipl_block.fcp.br_lba); + +/* NVMe ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.fid); +DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.nsid); +DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", + (unsigned long long)ipl_block.nvme.bootprog); +DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", + (unsigned long long)ipl_block.nvme.br_lba); + +/* ECKD ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, "%lld\n", + (unsigned long long)ipl_block.eckd.bootprog); + +#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + \ + if (!ipb->br_chr.cyl && \ + !ipb->br_chr.head && \ + !ipb->br_chr.record) \ + return sprintf(buf, "auto\n"); \ + \ + return sprintf(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ +} + +#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + unsigned long args[3] = { 0 }; \ + char *p, *p1, *tmp = NULL; \ + int i, rc; \ + \ + if (!strncmp(buf, "auto", 4)) \ + goto out; \ + \ + tmp = kstrdup(buf, GFP_KERNEL); \ + p = tmp; \ + for (i = 0; i < 3; i++) { \ + p1 = strsep(&p, ", "); \ + if (!p1) { \ + rc = -EINVAL; \ + goto err; \ + } \ + rc = kstrtoul(p1, 0, args + i); \ + if (rc) \ + goto err; \ + } \ + \ + rc = -EINVAL; \ + if (i != 3) \ + goto err; \ + \ + if ((args[0] || args[1]) && !args[2]) \ + goto err; \ + \ + if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \ + goto err; \ + \ +out: \ + ipb->br_chr.cyl = args[0]; \ + ipb->br_chr.head = args[1]; \ + ipb->br_chr.record = args[2]; \ + rc = len; \ +err: \ + kfree(tmp); \ + return rc; \ +} + +IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd); +static struct kobj_attribute sys_ipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL); + +IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd); + +static struct kobj_attribute sys_reipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store); + +static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char loadparm[LOADPARM_LEN + 1] = {}; + + if (!sclp_ipl_info.is_valid) + return sprintf(page, "#unknown#\n"); + memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + EBCASC(loadparm, LOADPARM_LEN); + strim(loadparm); + return sprintf(page, "%s\n", loadparm); +} + +static struct kobj_attribute sys_ipl_ccw_loadparm_attr = + __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); + +static struct attribute *ipl_fcp_attrs[] = { + &sys_ipl_device_attr.attr, + &sys_ipl_fcp_wwpn_attr.attr, + &sys_ipl_fcp_lun_attr.attr, + &sys_ipl_fcp_bootprog_attr.attr, + &sys_ipl_fcp_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_fcp_attr_group = { + .attrs = ipl_fcp_attrs, + .bin_attrs = ipl_fcp_bin_attrs, +}; + +static struct attribute *ipl_nvme_attrs[] = { + &sys_ipl_nvme_fid_attr.attr, + &sys_ipl_nvme_nsid_attr.attr, + &sys_ipl_nvme_bootprog_attr.attr, + &sys_ipl_nvme_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_nvme_attr_group = { + .attrs = ipl_nvme_attrs, + .bin_attrs = ipl_nvme_bin_attrs, +}; + +static struct attribute *ipl_eckd_attrs[] = { + &sys_ipl_eckd_bootprog_attr.attr, + &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_device_attr.attr, + NULL, +}; + +static struct attribute_group ipl_eckd_attr_group = { + .attrs = ipl_eckd_attrs, + .bin_attrs = ipl_eckd_bin_attrs, +}; + +/* CCW ipl device attributes */ + +static struct attribute *ipl_ccw_attrs_vm[] = { + &sys_ipl_device_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_vm_parm_attr.attr, + NULL, +}; + +static struct attribute *ipl_ccw_attrs_lpar[] = { + &sys_ipl_device_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group ipl_ccw_attr_group_vm = { + .attrs = ipl_ccw_attrs_vm, +}; + +static struct attribute_group ipl_ccw_attr_group_lpar = { + .attrs = ipl_ccw_attrs_lpar +}; + +static struct attribute *ipl_common_attrs[] = { + &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, + NULL, +}; + +static struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, +}; + +static struct kset *ipl_kset; + +static void __ipl_run(void *unused) +{ + diag308(DIAG308_LOAD_CLEAR, NULL); +} + +static void ipl_run(struct shutdown_trigger *trigger) +{ + smp_call_ipl_cpu(__ipl_run, NULL); +} + +static int __init ipl_init(void) +{ + int rc; + + ipl_kset = kset_create_and_add("ipl", NULL, firmware_kobj); + if (!ipl_kset) { + rc = -ENOMEM; + goto out; + } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; + switch (ipl_info.type) { + case IPL_TYPE_CCW: + if (MACHINE_IS_VM) + rc = sysfs_create_group(&ipl_kset->kobj, + &ipl_ccw_attr_group_vm); + else + rc = sysfs_create_group(&ipl_kset->kobj, + &ipl_ccw_attr_group_lpar); + break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); + break; + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); + break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); + break; + default: + break; + } +out: + if (rc) + panic("ipl_init failed: rc = %i\n", rc); + + return 0; +} + +static struct shutdown_action __refdata ipl_action = { + .name = SHUTDOWN_ACTION_IPL_STR, + .fn = ipl_run, + .init = ipl_init, +}; + +/* + * reipl shutdown action: Reboot Linux on shutdown. + */ + +/* VM IPL PARM attributes */ +static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, + char *page) +{ + char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; + + ipl_block_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); + return sprintf(page, "%s\n", vmparm); +} + +static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, + size_t vmparm_max, + const char *buf, size_t len) +{ + int i, ip_len; + + /* ignore trailing newline */ + ip_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + ip_len--; + + if (ip_len > vmparm_max) + return -EINVAL; + + /* parm is used to store kernel options, check for common chars */ + for (i = 0; i < ip_len; i++) + if (!(isalnum(buf[i]) || isascii(buf[i]) || isprint(buf[i]))) + return -EINVAL; + + memset(ipb->ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_parm_len = ip_len; + if (ip_len > 0) { + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + memcpy(ipb->ccw.vm_parm, buf, ip_len); + ASCEBC(ipb->ccw.vm_parm, ip_len); + } else { + ipb->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_VP; + } + + return len; +} + +/* NSS wrapper */ +static ssize_t reipl_nss_vmparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_vmparm_show(reipl_block_nss, page); +} + +static ssize_t reipl_nss_vmparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_vmparm_store(reipl_block_nss, 56, buf, len); +} + +/* CCW wrapper */ +static ssize_t reipl_ccw_vmparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_vmparm_show(reipl_block_ccw, page); +} + +static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_vmparm_store(reipl_block_ccw, 64, buf, len); +} + +static struct kobj_attribute sys_reipl_nss_vmparm_attr = + __ATTR(parm, 0644, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); +static struct kobj_attribute sys_reipl_ccw_vmparm_attr = + __ATTR(parm, 0644, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); + +/* FCP reipl device attributes */ + +static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_fcp->fcp.scp_data_len; + void *scp_data = reipl_block_fcp->fcp.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + + if (off) + return -EINVAL; + + memcpy(reipl_block_fcp->fcp.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_fcp->fcp.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.scp_data_len = scpdata_len; + + return count; +} +static struct bin_attribute sys_reipl_fcp_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read, + reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_fcp_bin_attrs[] = { + &sys_reipl_fcp_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", + reipl_block_fcp->fcp.wwpn); +DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", + reipl_block_fcp->fcp.lun); +DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", + reipl_block_fcp->fcp.bootprog); +DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", + reipl_block_fcp->fcp.br_lba); +DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", + reipl_block_fcp->fcp.devno); + +static void reipl_get_ascii_loadparm(char *loadparm, + struct ipl_parameter_block *ibp) +{ + memcpy(loadparm, ibp->common.loadparm, LOADPARM_LEN); + EBCASC(loadparm, LOADPARM_LEN); + loadparm[LOADPARM_LEN] = 0; + strim(loadparm); +} + +static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb, + char *page) +{ + char buf[LOADPARM_LEN + 1]; + + reipl_get_ascii_loadparm(buf, ipb); + return sprintf(page, "%s\n", buf); +} + +static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, + const char *buf, size_t len) +{ + int i, lp_len; + + /* ignore trailing newline */ + lp_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + lp_len--; + /* loadparm can have max 8 characters and must not start with a blank */ + if ((lp_len > LOADPARM_LEN) || ((lp_len > 0) && (buf[0] == ' '))) + return -EINVAL; + /* loadparm can only contain "a-z,A-Z,0-9,SP,." */ + for (i = 0; i < lp_len; i++) { + if (isalpha(buf[i]) || isdigit(buf[i]) || (buf[i] == ' ') || + (buf[i] == '.')) + continue; + return -EINVAL; + } + /* initialize loadparm with blanks */ + memset(ipb->common.loadparm, ' ', LOADPARM_LEN); + /* copy and convert to ebcdic */ + memcpy(ipb->common.loadparm, buf, lp_len); + ASCEBC(ipb->common.loadparm, LOADPARM_LEN); + ipb->common.flags |= IPL_PB0_FLAG_LOADPARM; + return len; +} + +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); + +static ssize_t reipl_fcp_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_fcp_clear); +} + +static ssize_t reipl_fcp_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_fcp_clear) < 0) + return -EINVAL; + return len; +} + +static struct attribute *reipl_fcp_attrs[] = { + &sys_reipl_fcp_device_attr.attr, + &sys_reipl_fcp_wwpn_attr.attr, + &sys_reipl_fcp_lun_attr.attr, + &sys_reipl_fcp_bootprog_attr.attr, + &sys_reipl_fcp_br_lba_attr.attr, + &sys_reipl_fcp_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_fcp_attr_group = { + .attrs = reipl_fcp_attrs, + .bin_attrs = reipl_fcp_bin_attrs, +}; + +static struct kobj_attribute sys_reipl_fcp_clear_attr = + __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); + +/* NVME reipl device attributes */ + +static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_nvme->nvme.scp_data_len; + void *scp_data = reipl_block_nvme->nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_nvme->nvme.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_nvme->nvme.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.scp_data_len = scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read, + reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_nvme_bin_attrs[] = { + &sys_reipl_nvme_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.br_lba); + +static struct attribute *reipl_nvme_attrs[] = { + &sys_reipl_nvme_fid_attr.attr, + &sys_reipl_nvme_nsid_attr.attr, + &sys_reipl_nvme_bootprog_attr.attr, + &sys_reipl_nvme_br_lba_attr.attr, + &sys_reipl_nvme_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nvme_attr_group = { + .attrs = reipl_nvme_attrs, + .bin_attrs = reipl_nvme_bin_attrs +}; + +static ssize_t reipl_nvme_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_nvme_clear); +} + +static ssize_t reipl_nvme_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_nvme_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_nvme_clear_attr = + __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store); + +/* CCW reipl device attributes */ +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); + +static ssize_t reipl_ccw_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_ccw_clear); +} + +static ssize_t reipl_ccw_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_ccw_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_ccw_clear_attr = + __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store); + +static struct attribute *reipl_ccw_attrs_vm[] = { + &sys_reipl_ccw_device_attr.attr, + &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_vmparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, + NULL, +}; + +static struct attribute *reipl_ccw_attrs_lpar[] = { + &sys_reipl_ccw_device_attr.attr, + &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, + NULL, +}; + +static struct attribute_group reipl_ccw_attr_group_vm = { + .name = IPL_CCW_STR, + .attrs = reipl_ccw_attrs_vm, +}; + +static struct attribute_group reipl_ccw_attr_group_lpar = { + .name = IPL_CCW_STR, + .attrs = reipl_ccw_attrs_lpar, +}; + +/* ECKD reipl device attributes */ + +static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_eckd->eckd.scp_data_len; + void *scp_data = reipl_block_eckd->eckd.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_eckd->eckd.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_eckd->eckd.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len; + reipl_block_eckd->eckd.scp_data_len = scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_eckd_scp_data_attr = + __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read, + reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_eckd_bin_attrs[] = { + &sys_reipl_eckd_scp_data_attr, + NULL, +}; + +DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->eckd.bootprog); + +static struct attribute *reipl_eckd_attrs[] = { + &sys_reipl_eckd_device_attr.attr, + &sys_reipl_eckd_bootprog_attr.attr, + &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_eckd_attr_group = { + .attrs = reipl_eckd_attrs, + .bin_attrs = reipl_eckd_bin_attrs +}; + +static ssize_t reipl_eckd_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_eckd_clear); +} + +static ssize_t reipl_eckd_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_eckd_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_eckd_clear_attr = + __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store); + +/* NSS reipl device attributes */ +static void reipl_get_ascii_nss_name(char *dst, + struct ipl_parameter_block *ipb) +{ + memcpy(dst, ipb->ccw.nss_name, NSS_NAME_SIZE); + EBCASC(dst, NSS_NAME_SIZE); + dst[NSS_NAME_SIZE] = 0; +} + +static ssize_t reipl_nss_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + char nss_name[NSS_NAME_SIZE + 1] = {}; + + reipl_get_ascii_nss_name(nss_name, reipl_block_nss); + return sprintf(page, "%s\n", nss_name); +} + +static ssize_t reipl_nss_name_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int nss_len; + + /* ignore trailing newline */ + nss_len = len; + if ((len > 0) && (buf[len - 1] == '\n')) + nss_len--; + + if (nss_len > NSS_NAME_SIZE) + return -EINVAL; + + memset(reipl_block_nss->ccw.nss_name, 0x40, NSS_NAME_SIZE); + if (nss_len > 0) { + reipl_block_nss->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_NSS; + memcpy(reipl_block_nss->ccw.nss_name, buf, nss_len); + ASCEBC(reipl_block_nss->ccw.nss_name, nss_len); + EBC_TOUPPER(reipl_block_nss->ccw.nss_name, nss_len); + } else { + reipl_block_nss->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_NSS; + } + + return len; +} + +static struct kobj_attribute sys_reipl_nss_name_attr = + __ATTR(name, 0644, reipl_nss_name_show, + reipl_nss_name_store); + +static struct attribute *reipl_nss_attrs[] = { + &sys_reipl_nss_name_attr.attr, + &sys_reipl_nss_loadparm_attr.attr, + &sys_reipl_nss_vmparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nss_attr_group = { + .name = IPL_NSS_STR, + .attrs = reipl_nss_attrs, +}; + +void set_os_info_reipl_block(void) +{ + os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); +} + +/* reipl type */ + +static int reipl_set_type(enum ipl_type type) +{ + if (!(reipl_capabilities & type)) + return -EINVAL; + + switch(type) { + case IPL_TYPE_CCW: + reipl_block_actual = reipl_block_ccw; + break; + case IPL_TYPE_ECKD: + reipl_block_actual = reipl_block_eckd; + break; + case IPL_TYPE_FCP: + reipl_block_actual = reipl_block_fcp; + break; + case IPL_TYPE_NVME: + reipl_block_actual = reipl_block_nvme; + break; + case IPL_TYPE_NSS: + reipl_block_actual = reipl_block_nss; + break; + default: + break; + } + reipl_type = type; + return 0; +} + +static ssize_t reipl_type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", ipl_type_str(reipl_type)); +} + +static ssize_t reipl_type_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int rc = -EINVAL; + + if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_ECKD); + else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_FCP); + else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NVME); + else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NSS); + return (rc != 0) ? rc : len; +} + +static struct kobj_attribute reipl_type_attr = + __ATTR(reipl_type, 0644, reipl_type_show, reipl_type_store); + +static struct kset *reipl_kset; +static struct kset *reipl_fcp_kset; +static struct kset *reipl_nvme_kset; +static struct kset *reipl_eckd_kset; + +static void __reipl_run(void *unused) +{ + switch (reipl_type) { + case IPL_TYPE_CCW: + diag308(DIAG308_SET, reipl_block_ccw); + if (reipl_ccw_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); + break; + case IPL_TYPE_ECKD: + diag308(DIAG308_SET, reipl_block_eckd); + if (reipl_eckd_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; + case IPL_TYPE_FCP: + diag308(DIAG308_SET, reipl_block_fcp); + if (reipl_fcp_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; + case IPL_TYPE_NVME: + diag308(DIAG308_SET, reipl_block_nvme); + if (reipl_nvme_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; + case IPL_TYPE_NSS: + diag308(DIAG308_SET, reipl_block_nss); + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_UNKNOWN: + diag308(DIAG308_LOAD_CLEAR, NULL); + break; + case IPL_TYPE_FCP_DUMP: + case IPL_TYPE_NVME_DUMP: + case IPL_TYPE_ECKD_DUMP: + break; + } + disabled_wait(); +} + +static void reipl_run(struct shutdown_trigger *trigger) +{ + smp_call_ipl_cpu(__reipl_run, NULL); +} + +static void reipl_block_ccw_init(struct ipl_parameter_block *ipb) +{ + ipb->hdr.len = IPL_BP_CCW_LEN; + ipb->hdr.version = IPL_PARM_BLOCK_VERSION; + ipb->pb0_hdr.len = IPL_BP0_CCW_LEN; + ipb->pb0_hdr.pbt = IPL_PBT_CCW; +} + +static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) +{ + /* LOADPARM */ + /* check if read scp info worked and set loadparm */ + if (sclp_ipl_info.is_valid) + memcpy(ipb->ccw.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + else + /* read scp info failed: set empty loadparm (EBCDIC blanks) */ + memset(ipb->ccw.loadparm, 0x40, LOADPARM_LEN); + ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; + + /* VM PARM */ + if (MACHINE_IS_VM && ipl_block_valid && + (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { + + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + ipb->ccw.vm_parm_len = ipl_block.ccw.vm_parm_len; + memcpy(ipb->ccw.vm_parm, + ipl_block.ccw.vm_parm, DIAG308_VMPARM_SIZE); + } +} + +static int __init reipl_nss_init(void) +{ + int rc; + + if (!MACHINE_IS_VM) + return 0; + + reipl_block_nss = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nss) + return -ENOMEM; + + rc = sysfs_create_group(&reipl_kset->kobj, &reipl_nss_attr_group); + if (rc) + return rc; + + reipl_block_ccw_init(reipl_block_nss); + reipl_capabilities |= IPL_TYPE_NSS; + return 0; +} + +static int __init reipl_ccw_init(void) +{ + int rc; + + reipl_block_ccw = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_ccw) + return -ENOMEM; + + rc = sysfs_create_group(&reipl_kset->kobj, + MACHINE_IS_VM ? &reipl_ccw_attr_group_vm + : &reipl_ccw_attr_group_lpar); + if (rc) + return rc; + + reipl_block_ccw_init(reipl_block_ccw); + if (ipl_info.type == IPL_TYPE_CCW) { + reipl_block_ccw->ccw.ssid = ipl_block.ccw.ssid; + reipl_block_ccw->ccw.devno = ipl_block.ccw.devno; + reipl_block_ccw_fill_parms(reipl_block_ccw); + } + + reipl_capabilities |= IPL_TYPE_CCW; + return 0; +} + +static int __init reipl_fcp_init(void) +{ + int rc; + + reipl_block_fcp = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_fcp) + return -ENOMEM; + + /* sysfs: create fcp kset for mixing attr group and bin attrs */ + reipl_fcp_kset = kset_create_and_add(IPL_FCP_STR, NULL, + &reipl_kset->kobj); + if (!reipl_fcp_kset) { + free_page((unsigned long) reipl_block_fcp); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_fcp_clear = true; + } + + if (ipl_info.type == IPL_TYPE_FCP) { + memcpy(reipl_block_fcp, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the SCSI IPL parameter block, so take it + * always from sclp_ipl_info. + */ + memcpy(reipl_block_fcp->fcp.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN; + reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + reipl_block_fcp->fcp.pbt = IPL_PBT_FCP; + reipl_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_FCP; + return 0; + +out2: + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); +out1: + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; +} + +static int __init reipl_nvme_init(void) +{ + int rc; + + reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nvme) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL, + &reipl_kset->kobj); + if (!reipl_nvme_kset) { + free_page((unsigned long) reipl_block_nvme); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_nvme_kset->kobj, + &sys_reipl_nvme_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_nvme_clear = true; + } + + if (ipl_info.type == IPL_TYPE_NVME) { + memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the IPL parameter block, so take it + * always from sclp_ipl_info. + */ + memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN; + reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + reipl_block_nvme->nvme.pbt = IPL_PBT_NVME; + reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_NVME; + return 0; + +out2: + sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); +out1: + kset_unregister(reipl_nvme_kset); + free_page((unsigned long) reipl_block_nvme); + return rc; +} + +static int __init reipl_eckd_init(void) +{ + int rc; + + if (!sclp.has_sipl_eckd) + return 0; + + reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!reipl_block_eckd) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL, + &reipl_kset->kobj); + if (!reipl_eckd_kset) { + free_page((unsigned long)reipl_block_eckd); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_eckd_kset->kobj, + &sys_reipl_eckd_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_eckd_clear = true; + } + + if (ipl_info.type == IPL_TYPE_ECKD) { + memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block)); + } else { + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD; + reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_ECKD; + return 0; + +out2: + sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); +out1: + kset_unregister(reipl_eckd_kset); + free_page((unsigned long)reipl_block_eckd); + return rc; +} + +static int __init reipl_type_init(void) +{ + enum ipl_type reipl_type = ipl_info.type; + struct ipl_parameter_block *reipl_block; + unsigned long size; + + reipl_block = os_info_old_entry(OS_INFO_REIPL_BLOCK, &size); + if (!reipl_block) + goto out; + /* + * If we have an OS info reipl block, this will be used + */ + if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { + memcpy(reipl_block_fcp, reipl_block, size); + reipl_type = IPL_TYPE_FCP; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) { + memcpy(reipl_block_nvme, reipl_block, size); + reipl_type = IPL_TYPE_NVME; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { + memcpy(reipl_block_ccw, reipl_block, size); + reipl_type = IPL_TYPE_CCW; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) { + memcpy(reipl_block_eckd, reipl_block, size); + reipl_type = IPL_TYPE_ECKD; + } +out: + return reipl_set_type(reipl_type); +} + +static int __init reipl_init(void) +{ + int rc; + + reipl_kset = kset_create_and_add("reipl", NULL, firmware_kobj); + if (!reipl_kset) + return -ENOMEM; + rc = sysfs_create_file(&reipl_kset->kobj, &reipl_type_attr.attr); + if (rc) { + kset_unregister(reipl_kset); + return rc; + } + rc = reipl_ccw_init(); + if (rc) + return rc; + rc = reipl_eckd_init(); + if (rc) + return rc; + rc = reipl_fcp_init(); + if (rc) + return rc; + rc = reipl_nvme_init(); + if (rc) + return rc; + rc = reipl_nss_init(); + if (rc) + return rc; + return reipl_type_init(); +} + +static struct shutdown_action __refdata reipl_action = { + .name = SHUTDOWN_ACTION_REIPL_STR, + .fn = reipl_run, + .init = reipl_init, +}; + +/* + * dump shutdown action: Dump Linux on shutdown. + */ + +/* FCP dump device attributes */ + +DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", + dump_block_fcp->fcp.wwpn); +DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", + dump_block_fcp->fcp.lun); +DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", + dump_block_fcp->fcp.bootprog); +DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", + dump_block_fcp->fcp.br_lba); +DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", + dump_block_fcp->fcp.devno); + +static struct attribute *dump_fcp_attrs[] = { + &sys_dump_fcp_device_attr.attr, + &sys_dump_fcp_wwpn_attr.attr, + &sys_dump_fcp_lun_attr.attr, + &sys_dump_fcp_bootprog_attr.attr, + &sys_dump_fcp_br_lba_attr.attr, + NULL, +}; + +static struct attribute_group dump_fcp_attr_group = { + .name = IPL_FCP_STR, + .attrs = dump_fcp_attrs, +}; + +/* NVME dump device attributes */ +DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n", + dump_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", + dump_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", + dump_block_nvme->nvme.br_lba); + +static struct attribute *dump_nvme_attrs[] = { + &sys_dump_nvme_fid_attr.attr, + &sys_dump_nvme_nsid_attr.attr, + &sys_dump_nvme_bootprog_attr.attr, + &sys_dump_nvme_br_lba_attr.attr, + NULL, +}; + +static struct attribute_group dump_nvme_attr_group = { + .name = IPL_NVME_STR, + .attrs = dump_nvme_attrs, +}; + +/* ECKD dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->eckd.bootprog); + +IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); + +static struct kobj_attribute sys_dump_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); + +static struct attribute *dump_eckd_attrs[] = { + &sys_dump_eckd_device_attr.attr, + &sys_dump_eckd_bootprog_attr.attr, + &sys_dump_eckd_br_chr_attr.attr, + NULL, +}; + +static struct attribute_group dump_eckd_attr_group = { + .name = IPL_ECKD_STR, + .attrs = dump_eckd_attrs, +}; + +/* CCW dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); + +static struct attribute *dump_ccw_attrs[] = { + &sys_dump_ccw_device_attr.attr, + NULL, +}; + +static struct attribute_group dump_ccw_attr_group = { + .name = IPL_CCW_STR, + .attrs = dump_ccw_attrs, +}; + +/* dump type */ + +static int dump_set_type(enum dump_type type) +{ + if (!(dump_capabilities & type)) + return -EINVAL; + dump_type = type; + return 0; +} + +static ssize_t dump_type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", dump_type_str(dump_type)); +} + +static ssize_t dump_type_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int rc = -EINVAL; + + if (strncmp(buf, DUMP_NONE_STR, strlen(DUMP_NONE_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NONE); + else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_ECKD); + else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_FCP); + else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_NVME); + return (rc != 0) ? rc : len; +} + +static struct kobj_attribute dump_type_attr = + __ATTR(dump_type, 0644, dump_type_show, dump_type_store); + +static struct kset *dump_kset; + +static void diag308_dump(void *dump_block) +{ + diag308(DIAG308_SET, dump_block); + while (1) { + if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) + break; + udelay(USEC_PER_SEC); + } +} + +static void __dump_run(void *unused) +{ + switch (dump_type) { + case DUMP_TYPE_CCW: + diag308_dump(dump_block_ccw); + break; + case DUMP_TYPE_ECKD: + diag308_dump(dump_block_eckd); + break; + case DUMP_TYPE_FCP: + diag308_dump(dump_block_fcp); + break; + case DUMP_TYPE_NVME: + diag308_dump(dump_block_nvme); + break; + default: + break; + } +} + +static void dump_run(struct shutdown_trigger *trigger) +{ + if (dump_type == DUMP_TYPE_NONE) + return; + smp_send_stop(); + smp_call_ipl_cpu(__dump_run, NULL); +} + +static int __init dump_ccw_init(void) +{ + int rc; + + dump_block_ccw = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_ccw) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_ccw_attr_group); + if (rc) { + free_page((unsigned long)dump_block_ccw); + return rc; + } + dump_block_ccw->hdr.len = IPL_BP_CCW_LEN; + dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_ccw->ccw.len = IPL_BP0_CCW_LEN; + dump_block_ccw->ccw.pbt = IPL_PBT_CCW; + dump_capabilities |= DUMP_TYPE_CCW; + return 0; +} + +static int __init dump_fcp_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_fcp = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_fcp) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_fcp_attr_group); + if (rc) { + free_page((unsigned long)dump_block_fcp); + return rc; + } + dump_block_fcp->hdr.len = IPL_BP_FCP_LEN; + dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + dump_block_fcp->fcp.pbt = IPL_PBT_FCP; + dump_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_FCP; + return 0; +} + +static int __init dump_nvme_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump) + return 0; /* LDIPL DUMP is not installed */ + dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!dump_block_nvme) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group); + if (rc) { + free_page((unsigned long)dump_block_nvme); + return rc; + } + dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; + dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; + dump_block_nvme->fcp.pbt = IPL_PBT_NVME; + dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_NVME; + return 0; +} + +static int __init dump_eckd_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd) + return 0; /* LDIPL DUMP is not installed */ + dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!dump_block_eckd) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group); + if (rc) { + free_page((unsigned long)dump_block_eckd); + return rc; + } + dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + dump_block_eckd->eckd.pbt = IPL_PBT_ECKD; + dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_ECKD; + return 0; +} + +static int __init dump_init(void) +{ + int rc; + + dump_kset = kset_create_and_add("dump", NULL, firmware_kobj); + if (!dump_kset) + return -ENOMEM; + rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr); + if (rc) { + kset_unregister(dump_kset); + return rc; + } + rc = dump_ccw_init(); + if (rc) + return rc; + rc = dump_eckd_init(); + if (rc) + return rc; + rc = dump_fcp_init(); + if (rc) + return rc; + rc = dump_nvme_init(); + if (rc) + return rc; + dump_set_type(DUMP_TYPE_NONE); + return 0; +} + +static struct shutdown_action __refdata dump_action = { + .name = SHUTDOWN_ACTION_DUMP_STR, + .fn = dump_run, + .init = dump_init, +}; + +static void dump_reipl_run(struct shutdown_trigger *trigger) +{ + struct lowcore *abs_lc; + unsigned int csum; + + /* + * Set REIPL_CLEAR flag in os_info flags entry indicating + * 'clear' sysfs attribute has been set on the panicked system + * for specified reipl type. + * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN. + */ + if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) || + (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) || + (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) || + (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) || + reipl_type == IPL_TYPE_NSS || + reipl_type == IPL_TYPE_UNKNOWN) + os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; + os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); + csum = (__force unsigned int) + csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); + abs_lc = get_abs_lowcore(); + abs_lc->ipib = __pa(reipl_block_actual); + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc); + dump_run(trigger); +} + +static struct shutdown_action __refdata dump_reipl_action = { + .name = SHUTDOWN_ACTION_DUMP_REIPL_STR, + .fn = dump_reipl_run, +}; + +/* + * vmcmd shutdown action: Trigger vm command on shutdown. + */ + +static char vmcmd_on_reboot[128]; +static char vmcmd_on_panic[128]; +static char vmcmd_on_halt[128]; +static char vmcmd_on_poff[128]; +static char vmcmd_on_restart[128]; + +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_halt, "%s\n", "%s\n", vmcmd_on_halt); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_poff, "%s\n", "%s\n", vmcmd_on_poff); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_restart, "%s\n", "%s\n", vmcmd_on_restart); + +static struct attribute *vmcmd_attrs[] = { + &sys_vmcmd_on_reboot_attr.attr, + &sys_vmcmd_on_panic_attr.attr, + &sys_vmcmd_on_halt_attr.attr, + &sys_vmcmd_on_poff_attr.attr, + &sys_vmcmd_on_restart_attr.attr, + NULL, +}; + +static struct attribute_group vmcmd_attr_group = { + .attrs = vmcmd_attrs, +}; + +static struct kset *vmcmd_kset; + +static void vmcmd_run(struct shutdown_trigger *trigger) +{ + char *cmd; + + if (strcmp(trigger->name, ON_REIPL_STR) == 0) + cmd = vmcmd_on_reboot; + else if (strcmp(trigger->name, ON_PANIC_STR) == 0) + cmd = vmcmd_on_panic; + else if (strcmp(trigger->name, ON_HALT_STR) == 0) + cmd = vmcmd_on_halt; + else if (strcmp(trigger->name, ON_POFF_STR) == 0) + cmd = vmcmd_on_poff; + else if (strcmp(trigger->name, ON_RESTART_STR) == 0) + cmd = vmcmd_on_restart; + else + return; + + if (strlen(cmd) == 0) + return; + __cpcmd(cmd, NULL, 0, NULL); +} + +static int vmcmd_init(void) +{ + if (!MACHINE_IS_VM) + return -EOPNOTSUPP; + vmcmd_kset = kset_create_and_add("vmcmd", NULL, firmware_kobj); + if (!vmcmd_kset) + return -ENOMEM; + return sysfs_create_group(&vmcmd_kset->kobj, &vmcmd_attr_group); +} + +static struct shutdown_action vmcmd_action = {SHUTDOWN_ACTION_VMCMD_STR, + vmcmd_run, vmcmd_init}; + +/* + * stop shutdown action: Stop Linux on shutdown. + */ + +static void stop_run(struct shutdown_trigger *trigger) +{ + if (strcmp(trigger->name, ON_PANIC_STR) == 0 || + strcmp(trigger->name, ON_RESTART_STR) == 0) + disabled_wait(); + smp_stop_cpu(); +} + +static struct shutdown_action stop_action = {SHUTDOWN_ACTION_STOP_STR, + stop_run, NULL}; + +/* action list */ + +static struct shutdown_action *shutdown_actions_list[] = { + &ipl_action, &reipl_action, &dump_reipl_action, &dump_action, + &vmcmd_action, &stop_action}; +#define SHUTDOWN_ACTIONS_COUNT (sizeof(shutdown_actions_list) / sizeof(void *)) + +/* + * Trigger section + */ + +static struct kset *shutdown_actions_kset; + +static int set_trigger(const char *buf, struct shutdown_trigger *trigger, + size_t len) +{ + int i; + + for (i = 0; i < SHUTDOWN_ACTIONS_COUNT; i++) { + if (sysfs_streq(buf, shutdown_actions_list[i]->name)) { + if (shutdown_actions_list[i]->init_rc) { + return shutdown_actions_list[i]->init_rc; + } else { + trigger->action = shutdown_actions_list[i]; + return len; + } + } + } + return -EINVAL; +} + +/* on reipl */ + +static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR, + &reipl_action}; + +static ssize_t on_reboot_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_reboot_trigger.action->name); +} + +static ssize_t on_reboot_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_reboot_trigger, len); +} +static struct kobj_attribute on_reboot_attr = __ATTR_RW(on_reboot); + +static void do_machine_restart(char *__unused) +{ + smp_send_stop(); + on_reboot_trigger.action->fn(&on_reboot_trigger); + reipl_run(NULL); +} +void (*_machine_restart)(char *command) = do_machine_restart; + +/* on panic */ + +static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action}; + +static ssize_t on_panic_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_panic_trigger.action->name); +} + +static ssize_t on_panic_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_panic_trigger, len); +} +static struct kobj_attribute on_panic_attr = __ATTR_RW(on_panic); + +static void do_panic(void) +{ + lgr_info_log(); + on_panic_trigger.action->fn(&on_panic_trigger); + stop_run(&on_panic_trigger); +} + +/* on restart */ + +static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, + &stop_action}; + +static ssize_t on_restart_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_restart_trigger.action->name); +} + +static ssize_t on_restart_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_restart_trigger, len); +} +static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart); + +static void __do_restart(void *ignore) +{ + smp_send_stop(); +#ifdef CONFIG_CRASH_DUMP + crash_kexec(NULL); +#endif + on_restart_trigger.action->fn(&on_restart_trigger); + stop_run(&on_restart_trigger); +} + +void do_restart(void *arg) +{ + tracing_off(); + debug_locks_off(); + lgr_info_log(); + smp_call_online_cpu(__do_restart, arg); +} + +/* on halt */ + +static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; + +static ssize_t on_halt_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_halt_trigger.action->name); +} + +static ssize_t on_halt_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_halt_trigger, len); +} +static struct kobj_attribute on_halt_attr = __ATTR_RW(on_halt); + +static void do_machine_halt(void) +{ + smp_send_stop(); + on_halt_trigger.action->fn(&on_halt_trigger); + stop_run(&on_halt_trigger); +} +void (*_machine_halt)(void) = do_machine_halt; + +/* on power off */ + +static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action}; + +static ssize_t on_poff_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_poff_trigger.action->name); +} + +static ssize_t on_poff_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_poff_trigger, len); +} +static struct kobj_attribute on_poff_attr = __ATTR_RW(on_poff); + +static void do_machine_power_off(void) +{ + smp_send_stop(); + on_poff_trigger.action->fn(&on_poff_trigger); + stop_run(&on_poff_trigger); +} +void (*_machine_power_off)(void) = do_machine_power_off; + +static struct attribute *shutdown_action_attrs[] = { + &on_restart_attr.attr, + &on_reboot_attr.attr, + &on_panic_attr.attr, + &on_halt_attr.attr, + &on_poff_attr.attr, + NULL, +}; + +static struct attribute_group shutdown_action_attr_group = { + .attrs = shutdown_action_attrs, +}; + +static void __init shutdown_triggers_init(void) +{ + shutdown_actions_kset = kset_create_and_add("shutdown_actions", NULL, + firmware_kobj); + if (!shutdown_actions_kset) + goto fail; + if (sysfs_create_group(&shutdown_actions_kset->kobj, + &shutdown_action_attr_group)) + goto fail; + return; +fail: + panic("shutdown_triggers_init failed\n"); +} + +static void __init shutdown_actions_init(void) +{ + int i; + + for (i = 0; i < SHUTDOWN_ACTIONS_COUNT; i++) { + if (!shutdown_actions_list[i]->init) + continue; + shutdown_actions_list[i]->init_rc = + shutdown_actions_list[i]->init(); + } +} + +static int __init s390_ipl_init(void) +{ + char str[8] = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40}; + + sclp_early_get_ipl_info(&sclp_ipl_info); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * returned by read SCP info is invalid (contains EBCDIC blanks) + * when the system has been booted via diag308. In that case we use + * the value from diag308, if available. + * + * There are also systems where diag308 store does not work in + * case the system is booted from HMC. Fortunately in this case + * READ SCP info provides the correct value. + */ + if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 && ipl_block_valid) + memcpy(sclp_ipl_info.loadparm, ipl_block.ccw.loadparm, LOADPARM_LEN); + shutdown_actions_init(); + shutdown_triggers_init(); + return 0; +} + +__initcall(s390_ipl_init); + +static void __init strncpy_skip_quote(char *dst, char *src, int n) +{ + int sx, dx; + + dx = 0; + for (sx = 0; src[sx] != 0; sx++) { + if (src[sx] == '"') + continue; + dst[dx++] = src[sx]; + if (dx >= n) + break; + } +} + +static int __init vmcmd_on_reboot_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_reboot, str, 127); + vmcmd_on_reboot[127] = 0; + on_reboot_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmreboot=", vmcmd_on_reboot_setup); + +static int __init vmcmd_on_panic_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_panic, str, 127); + vmcmd_on_panic[127] = 0; + on_panic_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmpanic=", vmcmd_on_panic_setup); + +static int __init vmcmd_on_halt_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_halt, str, 127); + vmcmd_on_halt[127] = 0; + on_halt_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmhalt=", vmcmd_on_halt_setup); + +static int __init vmcmd_on_poff_setup(char *str) +{ + if (!MACHINE_IS_VM) + return 1; + strncpy_skip_quote(vmcmd_on_poff, str, 127); + vmcmd_on_poff[127] = 0; + on_poff_trigger.action = &vmcmd_action; + return 1; +} +__setup("vmpoff=", vmcmd_on_poff_setup); + +static int on_panic_notify(struct notifier_block *self, + unsigned long event, void *data) +{ + do_panic(); + return NOTIFY_OK; +} + +static struct notifier_block on_panic_nb = { + .notifier_call = on_panic_notify, + .priority = INT_MIN, +}; + +void __init setup_ipl(void) +{ + BUILD_BUG_ON(sizeof(struct ipl_parameter_block) != PAGE_SIZE); + + ipl_info.type = get_ipl_type(); + switch (ipl_info.type) { + case IPL_TYPE_CCW: + ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; + ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; + break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid; + ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno; + break; + case IPL_TYPE_FCP: + case IPL_TYPE_FCP_DUMP: + ipl_info.data.fcp.dev_id.ssid = 0; + ipl_info.data.fcp.dev_id.devno = ipl_block.fcp.devno; + ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; + ipl_info.data.fcp.lun = ipl_block.fcp.lun; + break; + case IPL_TYPE_NVME: + case IPL_TYPE_NVME_DUMP: + ipl_info.data.nvme.fid = ipl_block.nvme.fid; + ipl_info.data.nvme.nsid = ipl_block.nvme.nsid; + break; + case IPL_TYPE_NSS: + case IPL_TYPE_UNKNOWN: + /* We have no info to copy */ + break; + } + atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); +} + +void s390_reset_system(void) +{ + /* Disable prefixing */ + set_prefix(0); + + /* Disable lowcore protection */ + __ctl_clear_bit(0, 28); + diag_amode31_ops.diag308_reset(); +} + +#ifdef CONFIG_KEXEC_FILE + +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert) +{ + struct ipl_report_component *comp; + + comp = vzalloc(sizeof(*comp)); + if (!comp) + return -ENOMEM; + list_add_tail(&comp->list, &report->components); + + comp->entry.addr = kbuf->mem; + comp->entry.len = kbuf->memsz; + comp->entry.flags = flags; + comp->entry.certificate_index = cert; + + report->size += sizeof(comp->entry); + + return 0; +} + +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len) +{ + struct ipl_report_certificate *cert; + + cert = vzalloc(sizeof(*cert)); + if (!cert) + return -ENOMEM; + list_add_tail(&cert->list, &report->certificates); + + cert->entry.addr = addr; + cert->entry.len = len; + cert->key = key; + + report->size += sizeof(cert->entry); + report->size += cert->entry.len; + + return 0; +} + +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib) +{ + struct ipl_report *report; + + report = vzalloc(sizeof(*report)); + if (!report) + return ERR_PTR(-ENOMEM); + + report->ipib = ipib; + INIT_LIST_HEAD(&report->components); + INIT_LIST_HEAD(&report->certificates); + + report->size = ALIGN(ipib->hdr.len, 8); + report->size += sizeof(struct ipl_rl_hdr); + report->size += sizeof(struct ipl_rb_components); + report->size += sizeof(struct ipl_rb_certificates); + + return report; +} + +void *ipl_report_finish(struct ipl_report *report) +{ + struct ipl_report_certificate *cert; + struct ipl_report_component *comp; + struct ipl_rb_certificates *certs; + struct ipl_parameter_block *ipib; + struct ipl_rb_components *comps; + struct ipl_rl_hdr *rl_hdr; + void *buf, *ptr; + + buf = vzalloc(report->size); + if (!buf) + goto out; + ptr = buf; + + memcpy(ptr, report->ipib, report->ipib->hdr.len); + ipib = ptr; + if (ipl_secure_flag) + ipib->hdr.flags |= IPL_PL_FLAG_SIPL; + ipib->hdr.flags |= IPL_PL_FLAG_IPLSR; + ptr += report->ipib->hdr.len; + ptr = PTR_ALIGN(ptr, 8); + + rl_hdr = ptr; + ptr += sizeof(*rl_hdr); + + comps = ptr; + comps->rbt = IPL_RBT_COMPONENTS; + ptr += sizeof(*comps); + list_for_each_entry(comp, &report->components, list) { + memcpy(ptr, &comp->entry, sizeof(comp->entry)); + ptr += sizeof(comp->entry); + } + comps->len = ptr - (void *)comps; + + certs = ptr; + certs->rbt = IPL_RBT_CERTIFICATES; + ptr += sizeof(*certs); + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, &cert->entry, sizeof(cert->entry)); + ptr += sizeof(cert->entry); + } + certs->len = ptr - (void *)certs; + rl_hdr->len = ptr - (void *)rl_hdr; + + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, cert->key, cert->entry.len); + ptr += cert->entry.len; + } + + BUG_ON(ptr > buf + report->size); +out: + return buf; +} + +int ipl_report_free(struct ipl_report *report) +{ + struct ipl_report_component *comp, *ncomp; + struct ipl_report_certificate *cert, *ncert; + + list_for_each_entry_safe(comp, ncomp, &report->components, list) + vfree(comp); + + list_for_each_entry_safe(cert, ncert, &report->certificates, list) + vfree(cert); + + vfree(report); + + return 0; +} + +#endif diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c new file mode 100644 index 0000000000..b5245fadcf --- /dev/null +++ b/arch/s390/kernel/ipl_vmparm.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +/* VM IPL PARM routines */ +size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, + const struct ipl_parameter_block *ipb) +{ + int i; + size_t len; + char has_lowercase = 0; + + len = 0; + if ((ipb->ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP) && + (ipb->ccw.vm_parm_len > 0)) { + + len = min_t(size_t, size - 1, ipb->ccw.vm_parm_len); + memcpy(dest, ipb->ccw.vm_parm, len); + /* If at least one character is lowercase, we assume mixed + * case; otherwise we convert everything to lowercase. + */ + for (i = 0; i < len; i++) + if ((dest[i] > 0x80 && dest[i] < 0x8a) || /* a-i */ + (dest[i] > 0x90 && dest[i] < 0x9a) || /* j-r */ + (dest[i] > 0xa1 && dest[i] < 0xaa)) { /* s-z */ + has_lowercase = 1; + break; + } + if (!has_lowercase) + EBC_TOLOWER(dest, len); + EBCASC(dest, len); + } + dest[len] = 0; + + return len; +} diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c new file mode 100644 index 0000000000..b020ff17d2 --- /dev/null +++ b/arch/s390/kernel/irq.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2004, 2011 + * Author(s): Martin Schwidefsky , + * Holger Smolinski , + * Thomas Spatzier , + * + * This file contains interrupt related functions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); +EXPORT_PER_CPU_SYMBOL_GPL(irq_stat); + +struct irq_class { + int irq; + char *name; + char *desc; +}; + +/* + * The list of "main" irq classes on s390. This is the list of interrupts + * that appear both in /proc/stat ("intr" line) and /proc/interrupts. + * Historically only external and I/O interrupts have been part of /proc/stat. + * We can't add the split external and I/O sub classes since the first field + * in the "intr" line in /proc/stat is supposed to be the sum of all other + * fields. + * Since the external and I/O interrupt fields are already sums we would end + * up with having a sum which accounts each interrupt twice. + */ +static const struct irq_class irqclass_main_desc[NR_IRQS_BASE] = { + {.irq = EXT_INTERRUPT, .name = "EXT"}, + {.irq = IO_INTERRUPT, .name = "I/O"}, + {.irq = THIN_INTERRUPT, .name = "AIO"}, +}; + +/* + * The list of split external and I/O interrupts that appear only in + * /proc/interrupts. + * In addition this list contains non external / I/O events like NMIs. + */ +static const struct irq_class irqclass_sub_desc[] = { + {.irq = IRQEXT_CLK, .name = "CLK", .desc = "[EXT] Clock Comparator"}, + {.irq = IRQEXT_EXC, .name = "EXC", .desc = "[EXT] External Call"}, + {.irq = IRQEXT_EMS, .name = "EMS", .desc = "[EXT] Emergency Signal"}, + {.irq = IRQEXT_TMR, .name = "TMR", .desc = "[EXT] CPU Timer"}, + {.irq = IRQEXT_TLA, .name = "TAL", .desc = "[EXT] Timing Alert"}, + {.irq = IRQEXT_PFL, .name = "PFL", .desc = "[EXT] Pseudo Page Fault"}, + {.irq = IRQEXT_DSD, .name = "DSD", .desc = "[EXT] DASD Diag"}, + {.irq = IRQEXT_VRT, .name = "VRT", .desc = "[EXT] Virtio"}, + {.irq = IRQEXT_SCP, .name = "SCP", .desc = "[EXT] Service Call"}, + {.irq = IRQEXT_IUC, .name = "IUC", .desc = "[EXT] IUCV"}, + {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, + {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, + {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, + {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, + {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, + {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, + {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, + {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, + {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, + {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, + {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, + {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, + {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, + {.irq = IRQIO_QAI, .name = "QAI", .desc = "[AIO] QDIO Adapter Interrupt"}, + {.irq = IRQIO_APB, .name = "APB", .desc = "[AIO] AP Bus"}, + {.irq = IRQIO_PCF, .name = "PCF", .desc = "[AIO] PCI Floating Interrupt"}, + {.irq = IRQIO_PCD, .name = "PCD", .desc = "[AIO] PCI Directed Interrupt"}, + {.irq = IRQIO_MSI, .name = "MSI", .desc = "[AIO] MSI Interrupt"}, + {.irq = IRQIO_VAI, .name = "VAI", .desc = "[AIO] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[AIO] GIB Alert"}, + {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, + {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, +}; + +static void do_IRQ(struct pt_regs *regs, int irq) +{ + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) + /* Serve timer interrupts first. */ + clock_comparator_work(); + generic_handle_irq(irq); +} + +static int on_async_stack(void) +{ + unsigned long frame = current_frame_address(); + + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; +} + +static void do_irq_async(struct pt_regs *regs, int irq) +{ + if (on_async_stack()) { + do_IRQ(regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } +} + +static int irq_pending(struct pt_regs *regs) +{ + int cc; + + asm volatile("tpi 0\n" + "ipm %0" : "=d" (cc) : : "cc"); + return cc >> 28; +} + +void noinstr do_io_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do { + regs->tpi_info = S390_lowcore.tpi_info; + if (S390_lowcore.tpi_info.adapter_IO) + do_irq_async(regs, THIN_INTERRUPT); + else + do_irq_async(regs, IO_INTERRUPT); + } while (MACHINE_IS_LPAR && irq_pending(regs)); + + irq_exit_rcu(); + + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); +} + +void noinstr do_ext_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + bool from_idle; + + irq_enter_rcu(); + + if (user_mode(regs)) { + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } + + regs->int_code = S390_lowcore.ext_int_code_addr; + regs->int_parm = S390_lowcore.ext_params; + regs->int_parm_long = S390_lowcore.ext_params2; + + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + account_idle_time_irq(); + + do_irq_async(regs, EXT_INTERRUPT); + + irq_exit_rcu(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); +} + +static void show_msi_interrupt(struct seq_file *p, int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int cpu; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (!desc) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + seq_printf(p, "%3d: ", irq); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu)); + + if (desc->irq_data.chip) + seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->action) + seq_printf(p, " %s", desc->action->name); + + seq_putc(p, '\n'); + raw_spin_unlock_irqrestore(&desc->lock, flags); +out: + rcu_read_unlock(); +} + +/* + * show_interrupts is needed by /proc/interrupts. + */ +int show_interrupts(struct seq_file *p, void *v) +{ + int index = *(loff_t *) v; + int cpu, irq; + + cpus_read_lock(); + if (index == 0) { + seq_puts(p, " "); + for_each_online_cpu(cpu) + seq_printf(p, "CPU%-8d", cpu); + seq_putc(p, '\n'); + } + if (index < NR_IRQS_BASE) { + seq_printf(p, "%s: ", irqclass_main_desc[index].name); + irq = irqclass_main_desc[index].irq; + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_putc(p, '\n'); + goto out; + } + if (index < nr_irqs) { + show_msi_interrupt(p, index); + goto out; + } + for (index = 0; index < NR_ARCH_IRQS; index++) { + seq_printf(p, "%s: ", irqclass_sub_desc[index].name); + irq = irqclass_sub_desc[index].irq; + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", + per_cpu(irq_stat, cpu).irqs[irq]); + if (irqclass_sub_desc[index].desc) + seq_printf(p, " %s", irqclass_sub_desc[index].desc); + seq_putc(p, '\n'); + } +out: + cpus_read_unlock(); + return 0; +} + +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < NR_IRQS_BASE ? NR_IRQS_BASE : from; +} + +/* + * ext_int_hash[index] is the list head for all external interrupts that hash + * to this index. + */ +static struct hlist_head ext_int_hash[32] ____cacheline_aligned; + +struct ext_int_info { + ext_int_handler_t handler; + struct hlist_node entry; + struct rcu_head rcu; + u16 code; +}; + +/* ext_int_hash_lock protects the handler lists for external interrupts */ +static DEFINE_SPINLOCK(ext_int_hash_lock); + +static inline int ext_hash(u16 code) +{ + BUILD_BUG_ON(!is_power_of_2(ARRAY_SIZE(ext_int_hash))); + + return (code + (code >> 9)) & (ARRAY_SIZE(ext_int_hash) - 1); +} + +int register_external_irq(u16 code, ext_int_handler_t handler) +{ + struct ext_int_info *p; + unsigned long flags; + int index; + + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (!p) + return -ENOMEM; + p->code = code; + p->handler = handler; + index = ext_hash(code); + + spin_lock_irqsave(&ext_int_hash_lock, flags); + hlist_add_head_rcu(&p->entry, &ext_int_hash[index]); + spin_unlock_irqrestore(&ext_int_hash_lock, flags); + return 0; +} +EXPORT_SYMBOL(register_external_irq); + +int unregister_external_irq(u16 code, ext_int_handler_t handler) +{ + struct ext_int_info *p; + unsigned long flags; + int index = ext_hash(code); + + spin_lock_irqsave(&ext_int_hash_lock, flags); + hlist_for_each_entry_rcu(p, &ext_int_hash[index], entry) { + if (p->code == code && p->handler == handler) { + hlist_del_rcu(&p->entry); + kfree_rcu(p, rcu); + } + } + spin_unlock_irqrestore(&ext_int_hash_lock, flags); + return 0; +} +EXPORT_SYMBOL(unregister_external_irq); + +static irqreturn_t do_ext_interrupt(int irq, void *dummy) +{ + struct pt_regs *regs = get_irq_regs(); + struct ext_code ext_code; + struct ext_int_info *p; + int index; + + ext_code.int_code = regs->int_code; + if (ext_code.code != EXT_IRQ_CLK_COMP) + set_cpu_flag(CIF_NOHZ_DELAY); + + index = ext_hash(ext_code.code); + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &ext_int_hash[index], entry) { + if (unlikely(p->code != ext_code.code)) + continue; + p->handler(ext_code, regs->int_parm, regs->int_parm_long); + } + rcu_read_unlock(); + return IRQ_HANDLED; +} + +static void __init init_ext_interrupts(void) +{ + int idx; + + for (idx = 0; idx < ARRAY_SIZE(ext_int_hash); idx++) + INIT_HLIST_HEAD(&ext_int_hash[idx]); + + irq_set_chip_and_handler(EXT_INTERRUPT, + &dummy_irq_chip, handle_percpu_irq); + if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL)) + panic("Failed to register EXT interrupt\n"); +} + +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); +} + +static DEFINE_SPINLOCK(irq_subclass_lock); +static unsigned char irq_subclass_refcount[64]; + +void irq_subclass_register(enum irq_subclass subclass) +{ + spin_lock(&irq_subclass_lock); + if (!irq_subclass_refcount[subclass]) + ctl_set_bit(0, subclass); + irq_subclass_refcount[subclass]++; + spin_unlock(&irq_subclass_lock); +} +EXPORT_SYMBOL(irq_subclass_register); + +void irq_subclass_unregister(enum irq_subclass subclass) +{ + spin_lock(&irq_subclass_lock); + irq_subclass_refcount[subclass]--; + if (!irq_subclass_refcount[subclass]) + ctl_clear_bit(0, subclass); + spin_unlock(&irq_subclass_lock); +} +EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c new file mode 100644 index 0000000000..e808bb8bc0 --- /dev/null +++ b/arch/s390/kernel/jump_label.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Jump label s390 support + * + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber + */ +#include +#include +#include +#include +#include + +struct insn { + u16 opcode; + s32 offset; +} __packed; + +static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn) +{ + /* brcl 0,offset */ + insn->opcode = 0xc004; + insn->offset = (jump_entry_target(entry) - jump_entry_code(entry)) >> 1; +} + +static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn) +{ + /* brcl 15,offset */ + insn->opcode = 0xc0f4; + insn->offset = (jump_entry_target(entry) - jump_entry_code(entry)) >> 1; +} + +static void jump_label_bug(struct jump_entry *entry, struct insn *expected, + struct insn *new) +{ + unsigned char *ipc = (unsigned char *)jump_entry_code(entry); + unsigned char *ipe = (unsigned char *)expected; + unsigned char *ipn = (unsigned char *)new; + + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); + pr_emerg("Found: %6ph\n", ipc); + pr_emerg("Expected: %6ph\n", ipe); + pr_emerg("New: %6ph\n", ipn); + panic("Corrupted kernel text"); +} + +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + void *code = (void *)jump_entry_code(entry); + struct insn old, new; + + if (type == JUMP_LABEL_JMP) { + jump_label_make_nop(entry, &old); + jump_label_make_branch(entry, &new); + } else { + jump_label_make_branch(entry, &old); + jump_label_make_nop(entry, &new); + } + if (memcmp(code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); + s390_kernel_write(code, &new, sizeof(new)); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type); + text_poke_sync(); +} + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + text_poke_sync(); +} diff --git a/arch/s390/kernel/kdebugfs.c b/arch/s390/kernel/kdebugfs.c new file mode 100644 index 0000000000..33130c7daf --- /dev/null +++ b/arch/s390/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("s390", NULL); + return 0; +} +postcore_initcall(arch_kdebugfs_init); diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c new file mode 100644 index 0000000000..9da6fa30c4 --- /dev/null +++ b/arch/s390/kernel/kexec_elf.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ELF loader for kexec_file_load system call. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#include +#include +#include +#include +#include + +static int kexec_file_add_kernel_elf(struct kimage *image, + struct s390_load_data *data) +{ + struct kexec_buf buf; + const Elf_Ehdr *ehdr; + const Elf_Phdr *phdr; + Elf_Addr entry; + void *kernel; + int i, ret; + + kernel = image->kernel_buf; + ehdr = (Elf_Ehdr *)kernel; + buf.image = image; + if (image->type == KEXEC_TYPE_CRASH) + entry = STARTUP_KDUMP_OFFSET; + else + entry = ehdr->e_entry; + + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type != PT_LOAD) + continue; + + buf.buffer = kernel + phdr->p_offset; + buf.bufsz = phdr->p_filesz; + + buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + buf.memsz = phdr->p_memsz; + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; + + if (entry - phdr->p_paddr < phdr->p_memsz) { + data->kernel_buf = buf.buffer; + data->kernel_mem = buf.mem; + data->parm = buf.buffer + PARMAREA; + } + + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); + ret = kexec_add_buffer(&buf); + if (ret) + return ret; + } + + return data->memsz ? 0 : -EINVAL; +} + +static void *s390_elf_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + const Elf_Ehdr *ehdr; + const Elf_Phdr *phdr; + size_t size; + int i; + + /* image->fobs->probe already checked for valid ELF magic number. */ + ehdr = (Elf_Ehdr *)kernel; + + if (ehdr->e_type != ET_EXEC || + ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + !elf_check_arch(ehdr)) + return ERR_PTR(-EINVAL); + + if (!ehdr->e_phnum || ehdr->e_phentsize != sizeof(Elf_Phdr)) + return ERR_PTR(-EINVAL); + + size = ehdr->e_ehsize + ehdr->e_phoff; + size += ehdr->e_phentsize * ehdr->e_phnum; + if (size > kernel_len) + return ERR_PTR(-EINVAL); + + phdr = (void *)ehdr + ehdr->e_phoff; + size = ALIGN(size, phdr->p_align); + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_INTERP) + return ERR_PTR(-EINVAL); + + if (phdr->p_offset > kernel_len) + return ERR_PTR(-EINVAL); + + size += ALIGN(phdr->p_filesz, phdr->p_align); + } + + if (size > kernel_len) + return ERR_PTR(-EINVAL); + + return kexec_file_add_components(image, kexec_file_add_kernel_elf); +} + +static int s390_elf_probe(const char *buf, unsigned long len) +{ + const Elf_Ehdr *ehdr; + + if (len < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + ehdr = (Elf_Ehdr *)buf; + + /* Only check the ELF magic number here and do proper validity check + * in the loader. Any check here that fails would send the erroneous + * ELF file to the image loader that does not care what it gets. + * (Most likely) causing behavior not intended by the user. + */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + return -ENOEXEC; + + return 0; +} + +const struct kexec_file_ops s390_kexec_elf_ops = { + .probe = s390_elf_probe, + .load = s390_elf_load, +#ifdef CONFIG_KEXEC_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_SIG */ +}; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c new file mode 100644 index 0000000000..af23eff577 --- /dev/null +++ b/arch/s390/kernel/kexec_image.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Image loader for kexec_file_load system call. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#include +#include +#include +#include +#include + +static int kexec_file_add_kernel_image(struct kimage *image, + struct s390_load_data *data) +{ + struct kexec_buf buf; + + buf.image = image; + + buf.buffer = image->kernel_buf; + buf.bufsz = image->kernel_buf_len; + + buf.mem = 0; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + buf.memsz = buf.bufsz; + + data->kernel_buf = image->kernel_buf; + data->kernel_mem = buf.mem; + data->parm = image->kernel_buf + PARMAREA; + data->memsz += buf.memsz; + + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); + return kexec_add_buffer(&buf); +} + +static void *s390_image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + return kexec_file_add_components(image, kexec_file_add_kernel_image); +} + +static int s390_image_probe(const char *buf, unsigned long len) +{ + /* Can't reliably tell if an image is valid. Therefore give the + * user whatever he wants. + */ + return 0; +} + +const struct kexec_file_ops s390_kexec_image_ops = { + .probe = s390_image_probe, + .load = s390_image_load, +#ifdef CONFIG_KEXEC_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_SIG */ +}; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c new file mode 100644 index 0000000000..d4b863ed0a --- /dev/null +++ b/arch/s390/kernel/kprobes.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel Probes (KProbes) + * + * Copyright IBM Corp. 2002, 2006 + * + * s390 port, used ppc64 as template. Mike Grundy + */ + +#define pr_fmt(fmt) "kprobes: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kprobes.h" +#include "entry.h" + +DEFINE_PER_CPU(struct kprobe *, current_kprobe); +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +struct kretprobe_blackpoint kretprobe_blacklist[] = { }; + +static int insn_page_in_use; + +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + set_memory_rox((unsigned long)page, 1); + return page; +} + +static void *alloc_s390_insn_page(void) +{ + if (xchg(&insn_page_in_use, 1) == 1) + return NULL; + return &kprobes_insn_page; +} + +static void free_s390_insn_page(void *page) +{ + xchg(&insn_page_in_use, 0); +} + +struct kprobe_insn_cache kprobe_s390_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), + .alloc = alloc_s390_insn_page, + .free = free_s390_insn_page, + .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, +}; + +static void copy_instruction(struct kprobe *p) +{ + kprobe_opcode_t insn[MAX_INSN_SIZE]; + s64 disp, new_disp; + u64 addr, new_addr; + unsigned int len; + + len = insn_length(*p->addr >> 8); + memcpy(&insn, p->addr, len); + p->opcode = insn[0]; + if (probe_is_insn_relative_long(&insn[0])) { + /* + * For pc-relative instructions in RIL-b or RIL-c format patch + * the RI2 displacement field. We have already made sure that + * the insn slot for the patched instruction is within the same + * 2GB area as the original instruction (either kernel image or + * module area). Therefore the new displacement will always fit. + */ + disp = *(s32 *)&insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&insn[1] = new_disp; + } + s390_kernel_write(p->ainsn.insn, &insn, len); +} +NOKPROBE_SYMBOL(copy_instruction); + +static int s390_get_insn_slot(struct kprobe *p) +{ + /* + * Get an insn slot that is within the same 2GB area like the original + * instruction. That way instructions with a 32bit signed displacement + * field can be patched and executed within the insn slot. + */ + p->ainsn.insn = NULL; + if (is_kernel((unsigned long)p->addr)) + p->ainsn.insn = get_s390_insn_slot(); + else if (is_module_addr(p->addr)) + p->ainsn.insn = get_insn_slot(); + return p->ainsn.insn ? 0 : -ENOMEM; +} +NOKPROBE_SYMBOL(s390_get_insn_slot); + +static void s390_free_insn_slot(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + if (is_kernel((unsigned long)p->addr)) + free_s390_insn_slot(p->ainsn.insn, 0); + else + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} +NOKPROBE_SYMBOL(s390_free_insn_slot); + +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) +{ + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. + */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + if (!can_probe((unsigned long)p->addr)) + return -EINVAL; + /* Make sure the probe isn't going on a difficult instruction */ + if (probe_is_prohibited_opcode(p->addr)) + return -EINVAL; + if (s390_get_insn_slot(p)) + return -ENOMEM; + copy_instruction(p); + return 0; +} +NOKPROBE_SYMBOL(arch_prepare_kprobe); + +struct swap_insn_args { + struct kprobe *p; + unsigned int arm_kprobe : 1; +}; + +static int swap_instruction(void *data) +{ + struct swap_insn_args *args = data; + struct kprobe *p = args->p; + u16 opc; + + opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode; + s390_kernel_write(p->addr, &opc, sizeof(opc)); + return 0; +} +NOKPROBE_SYMBOL(swap_instruction); + +void arch_arm_kprobe(struct kprobe *p) +{ + struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; + + stop_machine_cpuslocked(swap_instruction, &args, NULL); +} +NOKPROBE_SYMBOL(arch_arm_kprobe); + +void arch_disarm_kprobe(struct kprobe *p) +{ + struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; + + stop_machine_cpuslocked(swap_instruction, &args, NULL); +} +NOKPROBE_SYMBOL(arch_disarm_kprobe); + +void arch_remove_kprobe(struct kprobe *p) +{ + s390_free_insn_slot(p); +} +NOKPROBE_SYMBOL(arch_remove_kprobe); + +static void enable_singlestep(struct kprobe_ctlblk *kcb, + struct pt_regs *regs, + unsigned long ip) +{ + struct per_regs per_kprobe; + + /* Set up the PER control registers %cr9-%cr11 */ + per_kprobe.control = PER_EVENT_IFETCH; + per_kprobe.start = ip; + per_kprobe.end = ip; + + /* Save control regs and psw mask */ + __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + kcb->kprobe_saved_imask = regs->psw.mask & + (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); + + /* Set PER control regs, turns on single step for the given address */ + __ctl_load(per_kprobe, 9, 11); + regs->psw.mask |= PSW_MASK_PER; + regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); + regs->psw.addr = ip; +} +NOKPROBE_SYMBOL(enable_singlestep); + +static void disable_singlestep(struct kprobe_ctlblk *kcb, + struct pt_regs *regs, + unsigned long ip) +{ + /* Restore control regs and psw mask, set new psw address */ + __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + regs->psw.mask &= ~PSW_MASK_PER; + regs->psw.mask |= kcb->kprobe_saved_imask; + regs->psw.addr = ip; +} +NOKPROBE_SYMBOL(disable_singlestep); + +/* + * Activate a kprobe by storing its pointer to current_kprobe. The + * previous kprobe is stored in kcb->prev_kprobe. A stack of up to + * two kprobes can be active, see KPROBE_REENTER. + */ +static void push_kprobe(struct kprobe_ctlblk *kcb, struct kprobe *p) +{ + kcb->prev_kprobe.kp = __this_cpu_read(current_kprobe); + kcb->prev_kprobe.status = kcb->kprobe_status; + __this_cpu_write(current_kprobe, p); +} +NOKPROBE_SYMBOL(push_kprobe); + +/* + * Deactivate a kprobe by backing up to the previous state. If the + * current state is KPROBE_REENTER prev_kprobe.kp will be non-NULL, + * for any other state prev_kprobe.kp will be NULL. + */ +static void pop_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->prev_kprobe.kp = NULL; +} +NOKPROBE_SYMBOL(pop_kprobe); + +static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + kprobes_inc_nmissed_count(p); + break; + case KPROBE_HIT_SS: + case KPROBE_REENTER: + default: + /* + * A kprobe on the code path to single step an instruction + * is a BUG. The code path resides in the .kprobes.text + * section and is executed with interrupts disabled. + */ + pr_err("Failed to recover from reentered kprobes.\n"); + dump_kprobe(p); + BUG(); + } +} +NOKPROBE_SYMBOL(kprobe_reenter_check); + +static int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb; + struct kprobe *p; + + /* + * We want to disable preemption for the entire duration of kprobe + * processing. That includes the calls to the pre/post handlers + * and single stepping the kprobe instruction. + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + p = get_kprobe((void *)(regs->psw.addr - 2)); + + if (p) { + if (kprobe_running()) { + /* + * We have hit a kprobe while another is still + * active. This can happen in the pre and post + * handler. Single step the instruction of the + * new probe but do not call any handler function + * of this secondary kprobe. + * push_kprobe and pop_kprobe saves and restores + * the currently active kprobe. + */ + kprobe_reenter_check(kcb, p); + push_kprobe(kcb, p); + kcb->kprobe_status = KPROBE_REENTER; + } else { + /* + * If we have no pre-handler or it returned 0, we + * continue with single stepping. If we have a + * pre-handler and it returned non-zero, it prepped + * for changing execution path, so get out doing + * nothing more here. + */ + push_kprobe(kcb, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (p->pre_handler && p->pre_handler(p, regs)) { + pop_kprobe(kcb); + preempt_enable_no_resched(); + return 1; + } + kcb->kprobe_status = KPROBE_HIT_SS; + } + enable_singlestep(kcb, regs, (unsigned long) p->ainsn.insn); + return 1; + } /* else: + * No kprobe at this address and no active kprobe. The trap has + * not been caused by a kprobe breakpoint. The race of breakpoint + * vs. kprobe remove does not exist because on s390 as we use + * stop_machine to arm/disarm the breakpoints. + */ + preempt_enable_no_resched(); + return 0; +} +NOKPROBE_SYMBOL(kprobe_handler); + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long ip = regs->psw.addr; + int fixup = probe_get_fixup_type(p->ainsn.insn); + + if (fixup & FIXUP_PSW_NORMAL) + ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn; + + if (fixup & FIXUP_BRANCH_NOT_TAKEN) { + int ilen = insn_length(p->ainsn.insn[0] >> 8); + if (ip - (unsigned long) p->ainsn.insn == ilen) + ip = (unsigned long) p->addr + ilen; + } + + if (fixup & FIXUP_RETURN_REGISTER) { + int reg = (p->ainsn.insn[0] & 0xf0) >> 4; + regs->gprs[reg] += (unsigned long) p->addr - + (unsigned long) p->ainsn.insn; + } + + disable_singlestep(kcb, regs, ip); +} +NOKPROBE_SYMBOL(resume_execution); + +static int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe *p = kprobe_running(); + + if (!p) + return 0; + + resume_execution(p, regs); + if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + pop_kprobe(kcb); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, psw mask + * will have PER set, in which case, continue the remaining processing + * of do_single_step, as if this is not a probe hit. + */ + if (regs->psw.mask & PSW_MASK_PER) + return 0; + + return 1; +} +NOKPROBE_SYMBOL(post_kprobe_handler); + +static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe *p = kprobe_running(); + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the nip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + disable_singlestep(kcb, regs, (unsigned long) p->addr); + pop_kprobe(kcb); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + /* + * fixup_exception() could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} +NOKPROBE_SYMBOL(kprobe_trap_handler); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + int ret; + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_disable(); + ret = kprobe_trap_handler(regs, trapnr); + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_restore(regs->psw.mask & ~PSW_MASK_PER); + return ret; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +/* + * Wrapper routine to for handling exceptions. + */ +int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *) data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_disable(); + + switch (val) { + case DIE_BPT: + if (kprobe_handler(regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(regs)) + ret = NOTIFY_STOP; + break; + case DIE_TRAP: + if (!preemptible() && kprobe_running() && + kprobe_trap_handler(regs, args->trapnr)) + ret = NOTIFY_STOP; + break; + default: + break; + } + + if (regs->psw.mask & (PSW_MASK_IO | PSW_MASK_EXT)) + local_irq_restore(regs->psw.mask & ~PSW_MASK_PER); + + return ret; +} +NOKPROBE_SYMBOL(kprobe_exceptions_notify); + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} +NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h new file mode 100644 index 0000000000..dc3ed5098e --- /dev/null +++ b/arch/s390/kernel/kprobes.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _ARCH_S390_KPROBES_H +#define _ARCH_S390_KPROBES_H + +#include + +DEFINE_INSN_CACHE_OPS(s390_insn); + +#endif diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S new file mode 100644 index 0000000000..0fe4d725e9 --- /dev/null +++ b/arch/s390/kernel/kprobes_insn_page.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +/* + * insn_page is a special 4k aligned dummy function for kprobes. + * It will contain all kprobed instructions that are out-of-line executed. + * The page must be within the kernel image to guarantee that the + * out-of-line instructions are within 2GB distance of their original + * location. Using a dummy function ensures that the insn_page is within + * the text section of the kernel and mapped read-only/executable from + * the beginning on, thus avoiding to split large mappings if the page + * would be in the data section instead. + */ + .section .kprobes.text, "ax" + .balign 4096 +SYM_CODE_START(kprobes_insn_page) + .rept 2048 + .word 0x07fe + .endr +SYM_CODE_END(kprobes_insn_page) + .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c new file mode 100644 index 0000000000..6652e54cf3 --- /dev/null +++ b/arch/s390/kernel/lgr.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Linux Guest Relocation (LGR) detection + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LGR_TIMER_INTERVAL_SECS (30 * 60) +#define VM_LEVEL_MAX 2 /* Maximum is 8, but we only record two levels */ + +/* + * LGR info: Contains stfle and stsi data + */ +struct lgr_info { + /* Bit field with facility information: 4 DWORDs are stored */ + u64 stfle_fac_list[4]; + /* Level of system (1 = CEC, 2 = LPAR, 3 = z/VM */ + u32 level; + /* Level 1: CEC info (stsi 1.1.1) */ + char manufacturer[16]; + char type[4]; + char sequence[16]; + char plant[4]; + char model[16]; + /* Level 2: LPAR info (stsi 2.2.2) */ + u16 lpar_number; + char name[8]; + /* Level 3: VM info (stsi 3.2.2) */ + u8 vm_count; + struct { + char name[8]; + char cpi[16]; + } vm[VM_LEVEL_MAX]; +} __packed __aligned(8); + +/* + * LGR globals + */ +static char lgr_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static struct lgr_info lgr_info_last; +static struct lgr_info lgr_info_cur; +static struct debug_info *lgr_dbf; + +/* + * Copy buffer and then convert it to ASCII + */ +static void cpascii(char *dst, char *src, int size) +{ + memcpy(dst, src, size); + EBCASC(dst, size); +} + +/* + * Fill LGR info with 1.1.1 stsi data + */ +static void lgr_stsi_1_1_1(struct lgr_info *lgr_info) +{ + struct sysinfo_1_1_1 *si = (void *) lgr_page; + + if (stsi(si, 1, 1, 1)) + return; + cpascii(lgr_info->manufacturer, si->manufacturer, + sizeof(si->manufacturer)); + cpascii(lgr_info->type, si->type, sizeof(si->type)); + cpascii(lgr_info->model, si->model, sizeof(si->model)); + cpascii(lgr_info->sequence, si->sequence, sizeof(si->sequence)); + cpascii(lgr_info->plant, si->plant, sizeof(si->plant)); +} + +/* + * Fill LGR info with 2.2.2 stsi data + */ +static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_2_2_2 *si = (void *) lgr_page; + + if (stsi(si, 2, 2, 2)) + return; + cpascii(lgr_info->name, si->name, sizeof(si->name)); + lgr_info->lpar_number = si->lpar_number; +} + +/* + * Fill LGR info with 3.2.2 stsi data + */ +static void lgr_stsi_3_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_3_2_2 *si = (void *) lgr_page; + int i; + + if (stsi(si, 3, 2, 2)) + return; + for (i = 0; i < min_t(u8, si->count, VM_LEVEL_MAX); i++) { + cpascii(lgr_info->vm[i].name, si->vm[i].name, + sizeof(si->vm[i].name)); + cpascii(lgr_info->vm[i].cpi, si->vm[i].cpi, + sizeof(si->vm[i].cpi)); + } + lgr_info->vm_count = si->count; +} + +/* + * Fill LGR info with current data + */ +static void lgr_info_get(struct lgr_info *lgr_info) +{ + int level; + + memset(lgr_info, 0, sizeof(*lgr_info)); + stfle(lgr_info->stfle_fac_list, ARRAY_SIZE(lgr_info->stfle_fac_list)); + level = stsi(NULL, 0, 0, 0); + lgr_info->level = level; + if (level >= 1) + lgr_stsi_1_1_1(lgr_info); + if (level >= 2) + lgr_stsi_2_2_2(lgr_info); + if (level >= 3) + lgr_stsi_3_2_2(lgr_info); +} + +/* + * Check if LGR info has changed and if yes log new LGR info to s390dbf + */ +void lgr_info_log(void) +{ + static DEFINE_SPINLOCK(lgr_info_lock); + unsigned long flags; + + if (!spin_trylock_irqsave(&lgr_info_lock, flags)) + return; + lgr_info_get(&lgr_info_cur); + if (memcmp(&lgr_info_last, &lgr_info_cur, sizeof(lgr_info_cur)) != 0) { + debug_event(lgr_dbf, 1, &lgr_info_cur, sizeof(lgr_info_cur)); + lgr_info_last = lgr_info_cur; + } + spin_unlock_irqrestore(&lgr_info_lock, flags); +} +EXPORT_SYMBOL_GPL(lgr_info_log); + +static void lgr_timer_set(void); + +/* + * LGR timer callback + */ +static void lgr_timer_fn(struct timer_list *unused) +{ + lgr_info_log(); + lgr_timer_set(); +} + +static struct timer_list lgr_timer; + +/* + * Setup next LGR timer + */ +static void lgr_timer_set(void) +{ + mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); +} + +/* + * Initialize LGR: Add s390dbf, write initial lgr_info and setup timer + */ +static int __init lgr_init(void) +{ + lgr_dbf = debug_register("lgr", 1, 1, sizeof(struct lgr_info)); + if (!lgr_dbf) + return -ENOMEM; + debug_register_view(lgr_dbf, &debug_hex_ascii_view); + lgr_info_get(&lgr_info_last); + debug_event(lgr_dbf, 1, &lgr_info_last, sizeof(lgr_info_last)); + timer_setup(&lgr_timer, lgr_timer_fn, TIMER_DEFERRABLE); + lgr_timer_set(); + return 0; +} +device_initcall(lgr_init); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c new file mode 100644 index 0000000000..ce65fc0167 --- /dev/null +++ b/arch/s390/kernel/machine_kexec.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2005, 2011 + * + * Author(s): Rolf Adelsberger, + * Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long); +typedef int (*purgatory_t)(int); + +extern const unsigned char relocate_kernel[]; +extern const unsigned long long relocate_kernel_len; + +#ifdef CONFIG_CRASH_DUMP + +/* + * Reset the system, copy boot CPU registers to absolute zero, + * and jump to the kdump image + */ +static void __do_machine_kdump(void *data) +{ + struct kimage *image = data; + purgatory_t purgatory; + unsigned long prefix; + + purgatory = (purgatory_t)image->start; + + /* store_status() saved the prefix register to lowcore */ + prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + + /* Now do the reset */ + s390_reset_system(); + + /* + * Copy dump CPU store status info to absolute zero. + * This need to be done *after* s390_reset_system set the + * prefix register of this CPU to zero + */ + memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), + phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); + + call_nodat(1, int, purgatory, int, 1); + + /* Die if kdump returns */ + disabled_wait(); +} + +/* + * Start kdump: create a LGR log entry, store status of all CPUs and + * branch to __do_machine_kdump. + */ +static noinline void __machine_kdump(void *image) +{ + struct mcesa *mcesa; + union ctlreg2 cr2_old, cr2_new; + int this_cpu, cpu; + + lgr_info_log(); + /* Get status of the other CPUs */ + this_cpu = smp_find_processor_id(stap()); + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + if (smp_store_status(cpu)) + continue; + } + /* Store status of the boot CPU */ + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (MACHINE_HAS_VX) + save_vx_regs((__vector128 *) mcesa->vector_save_area); + if (MACHINE_HAS_GS) { + __ctl_store(cr2_old.val, 2, 2); + cr2_new = cr2_old; + cr2_new.gse = 1; + __ctl_load(cr2_new.val, 2, 2); + save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); + __ctl_load(cr2_old.val, 2, 2); + } + /* + * To create a good backchain for this CPU in the dump store_status + * is passed the address of a function. The address is saved into + * the PSW save area of the boot CPU and the function is invoked as + * a tail call of store_status. The backchain in the dump will look + * like this: + * restart_int_handler -> __machine_kexec -> __do_machine_kdump + * The call to store_status() will not return. + */ + store_status(__do_machine_kdump, image); +} + +#endif /* CONFIG_CRASH_DUMP */ + +/* + * Check if kdump checksums are valid: We call purgatory with parameter "0" + */ +static bool kdump_csum_valid(struct kimage *image) +{ +#ifdef CONFIG_CRASH_DUMP + purgatory_t purgatory = (purgatory_t)image->start; + int rc; + + rc = call_nodat(1, int, purgatory, int, 0); + return rc == 0; +#else + return false; +#endif +} + +#ifdef CONFIG_CRASH_DUMP + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr, size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + size = begin - crashk_res.start; + if (size) + os_info_crashkernel_add(crashk_res.start, size); + else + os_info_crashkernel_add(0, 0); +} + +static void crash_protect_pages(int protect) +{ + unsigned long size; + + if (!crashk_res.end) + return; + size = resource_size(&crashk_res); + if (protect) + set_memory_ro(crashk_res.start, size >> PAGE_SHIFT); + else + set_memory_rw(crashk_res.start, size >> PAGE_SHIFT); +} + +void arch_kexec_protect_crashkres(void) +{ + crash_protect_pages(1); +} + +void arch_kexec_unprotect_crashkres(void) +{ + crash_protect_pages(0); +} + +#endif + +/* + * Give back memory to hypervisor before new kdump is loaded + */ +static int machine_kexec_prepare_kdump(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (MACHINE_IS_VM) + diag10_range(PFN_DOWN(crashk_res.start), + PFN_DOWN(crashk_res.end - crashk_res.start + 1)); + return 0; +#else + return -EINVAL; +#endif +} + +int machine_kexec_prepare(struct kimage *image) +{ + void *reboot_code_buffer; + + if (image->type == KEXEC_TYPE_CRASH) + return machine_kexec_prepare_kdump(); + + /* We don't support anything but the default image type for now. */ + if (image->type != KEXEC_TYPE_DEFAULT) + return -EINVAL; + + /* Get the destination where the assembler code should be copied to.*/ + reboot_code_buffer = page_to_virt(image->control_code_page); + + /* Then copy it */ + memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +void arch_crash_save_vmcoreinfo(void) +{ + struct lowcore *abs_lc; + + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); +} + +void machine_shutdown(void) +{ +} + +void machine_crash_shutdown(struct pt_regs *regs) +{ + set_os_info_reipl_block(); +} + +/* + * Do normal kexec + */ +static void __do_machine_kexec(void *data) +{ + unsigned long data_mover, entry, diag308_subcode; + struct kimage *image = data; + + data_mover = page_to_phys(image->control_code_page); + entry = virt_to_phys(&image->head); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; + s390_reset_system(); + + call_nodat(3, void, (relocate_kernel_t)data_mover, + unsigned long, entry, + unsigned long, image->start, + unsigned long, diag308_subcode); + + /* Die if kexec returns */ + disabled_wait(); +} + +/* + * Reset system and call either kdump or normal kexec + */ +static void __machine_kexec(void *data) +{ + pfault_fini(); + tracing_off(); + debug_locks_off(); +#ifdef CONFIG_CRASH_DUMP + if (((struct kimage *) data)->type == KEXEC_TYPE_CRASH) + __machine_kdump(data); +#endif + __do_machine_kexec(data); +} + +/* + * Do either kdump or normal kexec. In case of kdump we first ask + * purgatory, if kdump checksums are valid. + */ +void machine_kexec(struct kimage *image) +{ + if (image->type == KEXEC_TYPE_CRASH && !kdump_csum_valid(image)) + return; + tracer_disable(); + smp_send_stop(); + smp_call_ipl_cpu(__machine_kexec, image); +} diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c new file mode 100644 index 0000000000..8d207b82d9 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_file.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 code for kexec_file_load system call + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct kexec_file_ops * const kexec_file_loaders[] = { + &s390_kexec_elf_ops, + &s390_kexec_image_ops, + NULL, +}; + +#ifdef CONFIG_KEXEC_SIG +int s390_verify_sig(const char *kernel, unsigned long kernel_len) +{ + const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + struct module_signature *ms; + unsigned long sig_len; + int ret; + + /* Skip signature verification when not secure IPLed. */ + if (!ipl_secure_flag) + return 0; + + if (marker_len > kernel_len) + return -EKEYREJECTED; + + if (memcmp(kernel + kernel_len - marker_len, MODULE_SIG_STRING, + marker_len)) + return -EKEYREJECTED; + kernel_len -= marker_len; + + ms = (void *)kernel + kernel_len - sizeof(*ms); + kernel_len -= sizeof(*ms); + + sig_len = be32_to_cpu(ms->sig_len); + if (sig_len >= kernel_len) + return -EKEYREJECTED; + kernel_len -= sig_len; + + if (ms->id_type != PKEY_ID_PKCS7) + return -EKEYREJECTED; + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + return -EBADMSG; + } + + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + return ret; +} +#endif /* CONFIG_KEXEC_SIG */ + +static int kexec_file_update_purgatory(struct kimage *image, + struct s390_load_data *data) +{ + u64 entry, type; + int ret; + + if (image->type == KEXEC_TYPE_CRASH) { + entry = STARTUP_KDUMP_OFFSET; + type = KEXEC_TYPE_CRASH; + } else { + entry = STARTUP_NORMAL_OFFSET; + type = KEXEC_TYPE_DEFAULT; + } + + ret = kexec_purgatory_get_set_symbol(image, "kernel_entry", &entry, + sizeof(entry), false); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "kernel_type", &type, + sizeof(type), false); + if (ret) + return ret; + + if (image->type == KEXEC_TYPE_CRASH) { + u64 crash_size; + + ret = kexec_purgatory_get_set_symbol(image, "crash_start", + &crashk_res.start, + sizeof(crashk_res.start), + false); + if (ret) + return ret; + + crash_size = crashk_res.end - crashk_res.start + 1; + ret = kexec_purgatory_get_set_symbol(image, "crash_size", + &crash_size, + sizeof(crash_size), + false); + } + return ret; +} + +static int kexec_file_add_purgatory(struct kimage *image, + struct s390_load_data *data) +{ + struct kexec_buf buf; + int ret; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_load_purgatory(image, &buf); + if (ret) + return ret; + data->memsz += buf.memsz; + + return kexec_file_update_purgatory(image, data); +} + +static int kexec_file_add_initrd(struct kimage *image, + struct s390_load_data *data) +{ + struct kexec_buf buf; + int ret; + + buf.image = image; + + buf.buffer = image->initrd_buf; + buf.bufsz = image->initrd_buf_len; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + buf.memsz = buf.bufsz; + + data->parm->initrd_start = data->memsz; + data->parm->initrd_size = buf.memsz; + data->memsz += buf.memsz; + + ret = kexec_add_buffer(&buf); + if (ret) + return ret; + + return ipl_report_add_component(data->report, &buf, 0, 0); +} + +static int kexec_file_add_ipl_report(struct kimage *image, + struct s390_load_data *data) +{ + __u32 *lc_ipl_parmblock_ptr; + unsigned int len, ncerts; + struct kexec_buf buf; + unsigned long addr; + void *ptr, *end; + int ret; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + + ptr = __va(ipl_cert_list_addr); + end = ptr + ipl_cert_list_size; + ncerts = 0; + while (ptr < end) { + ncerts++; + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ptr += len; + } + + addr = data->memsz + data->report->size; + addr += ncerts * sizeof(struct ipl_rb_certificate_entry); + ptr = __va(ipl_cert_list_addr); + while (ptr < end) { + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ipl_report_add_certificate(data->report, ptr, addr, len); + addr += len; + ptr += len; + } + + ret = -ENOMEM; + buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; + buf.bufsz = data->report->size; + buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; + + data->memsz += buf.memsz; + + lc_ipl_parmblock_ptr = + data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); + *lc_ipl_parmblock_ptr = (__u32)buf.mem; + + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ret = kexec_add_buffer(&buf); +out: + return ret; +} + +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)) +{ + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; + struct s390_load_data data = {0}; + unsigned long minsize; + int ret; + + data.report = ipl_report_init(&ipl_block); + if (IS_ERR(data.report)) + return data.report; + + ret = add_kernel(image, &data); + if (ret) + goto out; + + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) + goto out; + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + + memcpy(data.parm->command_line, image->cmdline_buf, + image->cmdline_buf_len); + + if (image->type == KEXEC_TYPE_CRASH) { + data.parm->oldmem_base = crashk_res.start; + data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; + } + + if (image->initrd_buf) { + ret = kexec_file_add_initrd(image, &data); + if (ret) + goto out; + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + goto out; + + if (data.kernel_mem == 0) { + unsigned long restart_psw = 0x0008000080000000UL; + restart_psw += image->start; + memcpy(data.kernel_buf, &restart_psw, sizeof(restart_psw)); + image->start = 0; + } + + ret = kexec_file_add_ipl_report(image, &data); +out: + ipl_report_free(data.report); + return ERR_PTR(ret); +} + +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab) +{ + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + Elf_Rela *relas; + int i, r_type; + int ret; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + relas = (void *)pi->ehdr + relsec->sh_offset; + + for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { + const Elf_Sym *sym; /* symbol to relocate */ + unsigned long addr; /* final location after relocation */ + unsigned long val; /* relocated symbol value */ + void *loc; /* tmp location to modify */ + + sym = (void *)pi->ehdr + symtab->sh_offset; + sym += ELF64_R_SYM(relas[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx >= pi->ehdr->e_shnum && + sym->st_shndx != SHN_ABS) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } + + loc = pi->purgatory_buf; + loc += section->sh_offset; + loc += relas[i].r_offset; + + val = sym->st_value; + if (sym->st_shndx != SHN_ABS) + val += pi->sechdrs[sym->st_shndx].sh_addr; + val += relas[i].r_addend; + + addr = section->sh_addr + relas[i].r_offset; + + r_type = ELF64_R_TYPE(relas[i].r_info); + + if (r_type == R_390_PLT32DBL) + r_type = R_390_PC32DBL; + + ret = arch_kexec_do_relocs(r_type, loc, val, addr); + if (ret) { + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } + } + return 0; +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); +} diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c new file mode 100644 index 0000000000..b7182cec48 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr) +{ + switch (r_type) { + case R_390_NONE: + break; + case R_390_8: /* Direct 8 bit. */ + *(u8 *)loc = val; + break; + case R_390_12: /* Direct 12 bit. */ + *(u16 *)loc &= 0xf000; + *(u16 *)loc |= val & 0xfff; + break; + case R_390_16: /* Direct 16 bit. */ + *(u16 *)loc = val; + break; + case R_390_20: /* Direct 20 bit. */ + *(u32 *)loc &= 0xf00000ff; + *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ + *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ + break; + case R_390_32: /* Direct 32 bit. */ + *(u32 *)loc = val; + break; + case R_390_64: /* Direct 64 bit. */ + case R_390_GLOB_DAT: + case R_390_JMP_SLOT: + *(u64 *)loc = val; + break; + case R_390_PC16: /* PC relative 16 bit. */ + *(u16 *)loc = (val - addr); + break; + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + *(u16 *)loc = (val - addr) >> 1; + break; + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + *(u32 *)loc = (val - addr) >> 1; + break; + case R_390_PC32: /* PC relative 32 bit. */ + *(u32 *)loc = (val - addr); + break; + case R_390_PC64: /* PC relative 64 bit. */ + *(u64 *)loc = (val - addr); + break; + case R_390_RELATIVE: + *(unsigned long *) loc = val; + break; + default: + return 1; + } + return 0; +} diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S new file mode 100644 index 0000000000..ae4d4fd9af --- /dev/null +++ b/arch/s390/kernel/mcount.S @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2008, 2009 + * + */ + +#include +#include +#include +#include +#include + +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) + +/* packed stack: allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 + +#ifdef CONFIG_FUNCTION_TRACER + + GEN_BR_THUNK %r1 + GEN_BR_THUNK %r14 + + .section .kprobes.text, "ax" + +SYM_FUNC_START(ftrace_stub) + BR_EX %r14 +SYM_FUNC_END(ftrace_stub) + +SYM_CODE_START(ftrace_stub_direct_tramp) + lgr %r1, %r0 + BR_EX %r1 +SYM_CODE_END(ftrace_stub_direct_tramp) + + .macro ftrace_regs_entry, allregs=0 + stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + + .if \allregs == 1 + # save psw mask + # don't put any instructions clobbering CC before this point + epsw %r1,%r14 + risbg %r14,%r1,0,31,32 + .endif + + lgr %r1,%r15 + # allocate stack frame for ftrace_caller to contain traced function + aghi %r15,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(__SF_GPRS+8*8)(%r15) + stg %r15,(__SF_GPRS+9*8)(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + + .if \allregs == 1 + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + .else + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) + .endif + + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address + aghi %r1,-TRACED_FUNC_FRAME_SIZE + stg %r1,__SF_BACKCHAIN(%r15) + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + aghik %r2,%r0,-MCOUNT_INSN_SIZE + lgrl %r4,function_trace_op + lgrl %r1,ftrace_func +#else + lgr %r2,%r0 + aghi %r2,-MCOUNT_INSN_SIZE + larl %r4,function_trace_op + lg %r4,0(%r4) + larl %r1,ftrace_func + lg %r1,0(%r1) +#endif + lgr %r3,%r14 + la %r5,STACK_FREGS(%r15) + BASR_EX %r14,%r1 +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# The j instruction gets runtime patched to a nop instruction. +# See ftrace_enable_ftrace_graph_caller. +SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) + j .Lftrace_graph_caller_end + lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15) + brasl %r14,prepare_ftrace_return + stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) +.Lftrace_graph_caller_end: +#endif + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 +#endif +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + BR_EX %r1 +SYM_CODE_END(ftrace_common) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +SYM_FUNC_START(return_to_handler) + stmg %r2,%r5,32(%r15) + lgr %r1,%r15 + aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) + stg %r1,__SF_BACKCHAIN(%r15) + la %r3,STACK_FRAME_OVERHEAD(%r15) + stg %r1,__FGRAPH_RET_FP(%r3) + stg %r2,__FGRAPH_RET_GPR2(%r3) + lgr %r2,%r3 + brasl %r14,ftrace_return_to_handler + aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE + lgr %r14,%r2 + lmg %r2,%r5,32(%r15) + BR_EX %r14 +SYM_FUNC_END(return_to_handler) + +#endif +#endif /* CONFIG_FUNCTION_TRACER */ + +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br) + lmg %r0,%r1,2(%r1) + br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br) + +#ifdef CONFIG_EXPOLINE +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl) + lmg %r0,%r1,2(%r1) + exrl %r0,0f + j . +0: br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) +#endif /* CONFIG_EXPOLINE */ + +#ifdef CONFIG_RETHOOK + +SYM_CODE_START(arch_rethook_trampoline) + stg %r14,(__SF_GPRS+8*8)(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) + stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) + + # store original stack pointer in backchain and pt_regs + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) + stg %r7,__SF_BACKCHAIN(%r15) + stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) + + # store full psw + epsw %r2,%r3 + risbg %r3,%r2,0,31,32 + stg %r3,STACK_PTREGS_PSW(%r15) + larl %r1,arch_rethook_trampoline + stg %r1,STACK_PTREGS_PSW+8(%r15) + + lay %r2,STACK_PTREGS(%r15) + brasl %r14,arch_rethook_trampoline_callback + + mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) + lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) + lpswe __SF_EMPTY(%r15) +SYM_CODE_END(arch_rethook_trampoline) + +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c new file mode 100644 index 0000000000..42215f9404 --- /dev/null +++ b/arch/s390/kernel/module.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel module help for s390. + * + * S390 version + * Copyright IBM Corp. 2002, 2003 + * Author(s): Arnd Bergmann (arndb@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * based on i386 version + * Copyright (C) 2001 Rusty Russell. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +#define PLT_ENTRY_SIZE 22 + +static unsigned long get_module_load_offset(void) +{ + static DEFINE_MUTEX(module_kaslr_mutex); + static unsigned long module_load_offset; + + if (!kaslr_enabled()) + return 0; + /* + * Calculate the module_load_offset the first time this code + * is called. Once calculated it stays the same until reboot. + */ + mutex_lock(&module_kaslr_mutex); + if (!module_load_offset) + module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + return module_load_offset; +} + +void *module_alloc(unsigned long size) +{ + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { + vfree(p); + return NULL; + } + return p; +} + +#ifdef CONFIG_FUNCTION_TRACER +void module_arch_cleanup(struct module *mod) +{ + module_memfree(mod->arch.trampolines_start); +} +#endif + +void module_arch_freeing_init(struct module *mod) +{ + if (is_livepatch_module(mod) && + mod->state == MODULE_STATE_LIVE) + return; + + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; +} + +static void check_rela(Elf_Rela *rela, struct module *me) +{ + struct mod_arch_syminfo *info; + + info = me->arch.syminfo + ELF_R_SYM (rela->r_info); + switch (ELF_R_TYPE (rela->r_info)) { + case R_390_GOT12: /* 12 bit GOT offset. */ + case R_390_GOT16: /* 16 bit GOT offset. */ + case R_390_GOT20: /* 20 bit GOT offset. */ + case R_390_GOT32: /* 32 bit GOT offset. */ + case R_390_GOT64: /* 64 bit GOT offset. */ + case R_390_GOTENT: /* 32 bit PC rel. to GOT entry shifted by 1. */ + case R_390_GOTPLT12: /* 12 bit offset to jump slot. */ + case R_390_GOTPLT16: /* 16 bit offset to jump slot. */ + case R_390_GOTPLT20: /* 20 bit offset to jump slot. */ + case R_390_GOTPLT32: /* 32 bit offset to jump slot. */ + case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ + case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ + if (info->got_offset == -1UL) { + info->got_offset = me->arch.got_size; + me->arch.got_size += sizeof(void*); + } + break; + case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32DBL: /* 32 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32: /* 32 bit PC relative PLT address. */ + case R_390_PLT64: /* 64 bit PC relative PLT address. */ + case R_390_PLTOFF16: /* 16 bit offset from GOT to PLT. */ + case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ + case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ + if (info->plt_offset == -1UL) { + info->plt_offset = me->arch.plt_size; + me->arch.plt_size += PLT_ENTRY_SIZE; + } + break; + case R_390_COPY: + case R_390_GLOB_DAT: + case R_390_JMP_SLOT: + case R_390_RELATIVE: + /* Only needed if we want to support loading of + modules linked with -shared. */ + break; + } +} + +/* + * Account for GOT and PLT relocations. We can't add sections for + * got and plt but we can increase the core module size. + */ +int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *me) +{ + Elf_Shdr *symtab; + Elf_Sym *symbols; + Elf_Rela *rela; + char *strings; + int nrela, i, j; + struct module_memory *mod_mem; + + /* Find symbol table and string table. */ + symtab = NULL; + for (i = 0; i < hdr->e_shnum; i++) + switch (sechdrs[i].sh_type) { + case SHT_SYMTAB: + symtab = sechdrs + i; + break; + } + if (!symtab) { + printk(KERN_ERR "module %s: no symbol table\n", me->name); + return -ENOEXEC; + } + + /* Allocate one syminfo structure per symbol. */ + me->arch.nsyms = symtab->sh_size / sizeof(Elf_Sym); + me->arch.syminfo = vmalloc(array_size(sizeof(struct mod_arch_syminfo), + me->arch.nsyms)); + if (!me->arch.syminfo) + return -ENOMEM; + symbols = (void *) hdr + symtab->sh_offset; + strings = (void *) hdr + sechdrs[symtab->sh_link].sh_offset; + for (i = 0; i < me->arch.nsyms; i++) { + if (symbols[i].st_shndx == SHN_UNDEF && + strcmp(strings + symbols[i].st_name, + "_GLOBAL_OFFSET_TABLE_") == 0) + /* "Define" it as absolute. */ + symbols[i].st_shndx = SHN_ABS; + me->arch.syminfo[i].got_offset = -1UL; + me->arch.syminfo[i].plt_offset = -1UL; + me->arch.syminfo[i].got_initialized = 0; + me->arch.syminfo[i].plt_initialized = 0; + } + + /* Search for got/plt relocations. */ + me->arch.got_size = me->arch.plt_size = 0; + for (i = 0; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_RELA) + continue; + nrela = sechdrs[i].sh_size / sizeof(Elf_Rela); + rela = (void *) hdr + sechdrs[i].sh_offset; + for (j = 0; j < nrela; j++) + check_rela(rela + j, me); + } + + /* Increase core size by size of got & plt and set start + offsets for got and plt. */ + mod_mem = &me->mem[MOD_TEXT]; + mod_mem->size = ALIGN(mod_mem->size, 4); + me->arch.got_offset = mod_mem->size; + mod_mem->size += me->arch.got_size; + me->arch.plt_offset = mod_mem->size; + if (me->arch.plt_size) { + if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) + me->arch.plt_size += PLT_ENTRY_SIZE; + mod_mem->size += me->arch.plt_size; + } + return 0; +} + +static int apply_rela_bits(Elf_Addr loc, Elf_Addr val, + int sign, int bits, int shift, + void *(*write)(void *dest, const void *src, size_t len)) +{ + unsigned long umax; + long min, max; + void *dest = (void *)loc; + + if (val & ((1UL << shift) - 1)) + return -ENOEXEC; + if (sign) { + val = (Elf_Addr)(((long) val) >> shift); + min = -(1L << (bits - 1)); + max = (1L << (bits - 1)) - 1; + if ((long) val < min || (long) val > max) + return -ENOEXEC; + } else { + val >>= shift; + umax = ((1UL << (bits - 1)) << 1) - 1; + if ((unsigned long) val > umax) + return -ENOEXEC; + } + + if (bits == 8) { + unsigned char tmp = val; + write(dest, &tmp, 1); + } else if (bits == 12) { + unsigned short tmp = (val & 0xfff) | + (*(unsigned short *) loc & 0xf000); + write(dest, &tmp, 2); + } else if (bits == 16) { + unsigned short tmp = val; + write(dest, &tmp, 2); + } else if (bits == 20) { + unsigned int tmp = (val & 0xfff) << 16 | + (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff); + write(dest, &tmp, 4); + } else if (bits == 32) { + unsigned int tmp = val; + write(dest, &tmp, 4); + } else if (bits == 64) { + unsigned long tmp = val; + write(dest, &tmp, 8); + } + return 0; +} + +static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, + const char *strtab, struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) +{ + struct mod_arch_syminfo *info; + Elf_Addr loc, val; + int r_type, r_sym; + int rc = -ENOEXEC; + + /* This is where to make the change */ + loc = base + rela->r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + r_sym = ELF_R_SYM(rela->r_info); + r_type = ELF_R_TYPE(rela->r_info); + info = me->arch.syminfo + r_sym; + val = symtab[r_sym].st_value; + + switch (r_type) { + case R_390_NONE: /* No relocation. */ + rc = 0; + break; + case R_390_8: /* Direct 8 bit. */ + case R_390_12: /* Direct 12 bit. */ + case R_390_16: /* Direct 16 bit. */ + case R_390_20: /* Direct 20 bit. */ + case R_390_32: /* Direct 32 bit. */ + case R_390_64: /* Direct 64 bit. */ + val += rela->r_addend; + if (r_type == R_390_8) + rc = apply_rela_bits(loc, val, 0, 8, 0, write); + else if (r_type == R_390_12) + rc = apply_rela_bits(loc, val, 0, 12, 0, write); + else if (r_type == R_390_16) + rc = apply_rela_bits(loc, val, 0, 16, 0, write); + else if (r_type == R_390_20) + rc = apply_rela_bits(loc, val, 1, 20, 0, write); + else if (r_type == R_390_32) + rc = apply_rela_bits(loc, val, 0, 32, 0, write); + else if (r_type == R_390_64) + rc = apply_rela_bits(loc, val, 0, 64, 0, write); + break; + case R_390_PC16: /* PC relative 16 bit. */ + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + case R_390_PC32: /* PC relative 32 bit. */ + case R_390_PC64: /* PC relative 64 bit. */ + val += rela->r_addend - loc; + if (r_type == R_390_PC16) + rc = apply_rela_bits(loc, val, 1, 16, 0, write); + else if (r_type == R_390_PC16DBL) + rc = apply_rela_bits(loc, val, 1, 16, 1, write); + else if (r_type == R_390_PC32DBL) + rc = apply_rela_bits(loc, val, 1, 32, 1, write); + else if (r_type == R_390_PC32) + rc = apply_rela_bits(loc, val, 1, 32, 0, write); + else if (r_type == R_390_PC64) + rc = apply_rela_bits(loc, val, 1, 64, 0, write); + break; + case R_390_GOT12: /* 12 bit GOT offset. */ + case R_390_GOT16: /* 16 bit GOT offset. */ + case R_390_GOT20: /* 20 bit GOT offset. */ + case R_390_GOT32: /* 32 bit GOT offset. */ + case R_390_GOT64: /* 64 bit GOT offset. */ + case R_390_GOTENT: /* 32 bit PC rel. to GOT entry shifted by 1. */ + case R_390_GOTPLT12: /* 12 bit offset to jump slot. */ + case R_390_GOTPLT20: /* 20 bit offset to jump slot. */ + case R_390_GOTPLT16: /* 16 bit offset to jump slot. */ + case R_390_GOTPLT32: /* 32 bit offset to jump slot. */ + case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ + case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ + if (info->got_initialized == 0) { + Elf_Addr *gotent = me->mem[MOD_TEXT].base + + me->arch.got_offset + + info->got_offset; + + write(gotent, &val, sizeof(*gotent)); + info->got_initialized = 1; + } + val = info->got_offset + rela->r_addend; + if (r_type == R_390_GOT12 || + r_type == R_390_GOTPLT12) + rc = apply_rela_bits(loc, val, 0, 12, 0, write); + else if (r_type == R_390_GOT16 || + r_type == R_390_GOTPLT16) + rc = apply_rela_bits(loc, val, 0, 16, 0, write); + else if (r_type == R_390_GOT20 || + r_type == R_390_GOTPLT20) + rc = apply_rela_bits(loc, val, 1, 20, 0, write); + else if (r_type == R_390_GOT32 || + r_type == R_390_GOTPLT32) + rc = apply_rela_bits(loc, val, 0, 32, 0, write); + else if (r_type == R_390_GOT64 || + r_type == R_390_GOTPLT64) + rc = apply_rela_bits(loc, val, 0, 64, 0, write); + else if (r_type == R_390_GOTENT || + r_type == R_390_GOTPLTENT) { + val += (Elf_Addr)me->mem[MOD_TEXT].base + + me->arch.got_offset - loc; + rc = apply_rela_bits(loc, val, 1, 32, 1, write); + } + break; + case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32DBL: /* 32 bit PC rel. PLT shifted by 1. */ + case R_390_PLT32: /* 32 bit PC relative PLT address. */ + case R_390_PLT64: /* 64 bit PC relative PLT address. */ + case R_390_PLTOFF16: /* 16 bit offset from GOT to PLT. */ + case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ + case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ + if (info->plt_initialized == 0) { + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; + + plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ + if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; + } else { + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ + } + *(long *)&insn[14] = val; + + write(ip, insn, sizeof(insn)); + info->plt_initialized = 1; + } + if (r_type == R_390_PLTOFF16 || + r_type == R_390_PLTOFF32 || + r_type == R_390_PLTOFF64) + val = me->arch.plt_offset - me->arch.got_offset + + info->plt_offset + rela->r_addend; + else { + if (!((r_type == R_390_PLT16DBL && + val - loc + 0xffffUL < 0x1ffffeUL) || + (r_type == R_390_PLT32DBL && + val - loc + 0xffffffffULL < 0x1fffffffeULL))) + val = (Elf_Addr) me->mem[MOD_TEXT].base + + me->arch.plt_offset + + info->plt_offset; + val += rela->r_addend - loc; + } + if (r_type == R_390_PLT16DBL) + rc = apply_rela_bits(loc, val, 1, 16, 1, write); + else if (r_type == R_390_PLTOFF16) + rc = apply_rela_bits(loc, val, 0, 16, 0, write); + else if (r_type == R_390_PLT32DBL) + rc = apply_rela_bits(loc, val, 1, 32, 1, write); + else if (r_type == R_390_PLT32 || + r_type == R_390_PLTOFF32) + rc = apply_rela_bits(loc, val, 0, 32, 0, write); + else if (r_type == R_390_PLT64 || + r_type == R_390_PLTOFF64) + rc = apply_rela_bits(loc, val, 0, 64, 0, write); + break; + case R_390_GOTOFF16: /* 16 bit offset to GOT. */ + case R_390_GOTOFF32: /* 32 bit offset to GOT. */ + case R_390_GOTOFF64: /* 64 bit offset to GOT. */ + val = val + rela->r_addend - + ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset); + if (r_type == R_390_GOTOFF16) + rc = apply_rela_bits(loc, val, 0, 16, 0, write); + else if (r_type == R_390_GOTOFF32) + rc = apply_rela_bits(loc, val, 0, 32, 0, write); + else if (r_type == R_390_GOTOFF64) + rc = apply_rela_bits(loc, val, 0, 64, 0, write); + break; + case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ + case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset + + rela->r_addend - loc; + if (r_type == R_390_GOTPC) + rc = apply_rela_bits(loc, val, 1, 32, 0, write); + else if (r_type == R_390_GOTPCDBL) + rc = apply_rela_bits(loc, val, 1, 32, 1, write); + break; + case R_390_COPY: + case R_390_GLOB_DAT: /* Create GOT entry. */ + case R_390_JMP_SLOT: /* Create PLT entry. */ + case R_390_RELATIVE: /* Adjust by program base. */ + /* Only needed if we want to support loading of + modules linked with -shared. */ + return -ENOEXEC; + default: + printk(KERN_ERR "module %s: unknown relocation: %u\n", + me->name, r_type); + return -ENOEXEC; + } + if (rc) { + printk(KERN_ERR "module %s: relocation error for symbol %s " + "(r_type %i, value 0x%lx)\n", + me->name, strtab + symtab[r_sym].st_name, + r_type, (unsigned long) val); + return rc; + } + return 0; +} + +static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, + unsigned int symindex, unsigned int relsec, + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) +{ + Elf_Addr base; + Elf_Sym *symtab; + Elf_Rela *rela; + unsigned long i, n; + int rc; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + base = sechdrs[sechdrs[relsec].sh_info].sh_addr; + symtab = (Elf_Sym *) sechdrs[symindex].sh_addr; + rela = (Elf_Rela *) sechdrs[relsec].sh_addr; + n = sechdrs[relsec].sh_size / sizeof(Elf_Rela); + + for (i = 0; i < n; i++, rela++) { + rc = apply_rela(rela, base, symtab, strtab, me, write); + if (rc) + return rc; + } + return 0; +} + +int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, + unsigned int symindex, unsigned int relsec, + struct module *me) +{ + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) + write = s390_kernel_write; + + return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); +} + +#ifdef CONFIG_FUNCTION_TRACER +static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, + const Elf_Shdr *s) +{ + char *start, *end; + int numpages; + size_t size; + + size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); + numpages = DIV_ROUND_UP(size, PAGE_SIZE); + start = module_alloc(numpages * PAGE_SIZE); + if (!start) + return -ENOMEM; + set_memory_rox((unsigned long)start, numpages); + end = start + size; + + me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start; + me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end; + me->arch.next_trampoline = me->arch.trampolines_start; + + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s; + char *secstrings, *secname; + void *aseg; +#ifdef CONFIG_FUNCTION_TRACER + int ret; +#endif + + if (IS_ENABLED(CONFIG_EXPOLINE) && + !nospec_disable && me->arch.plt_size) { + unsigned int *ij; + + ij = me->mem[MOD_TEXT].base + me->arch.plt_offset + + me->arch.plt_size - PLT_ENTRY_SIZE; + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . */ + ij[2] = 0x000007f1; /* br %r1 */ + } + + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + aseg = (void *) s->sh_addr; + secname = secstrings + s->sh_name; + + if (!strcmp(".altinstructions", secname)) + /* patch .altinstructions */ + apply_alternatives(aseg, aseg + s->sh_size); + + if (IS_ENABLED(CONFIG_EXPOLINE) && + (str_has_prefix(secname, ".s390_indirect"))) + nospec_revert(aseg, aseg + s->sh_size); + + if (IS_ENABLED(CONFIG_EXPOLINE) && + (str_has_prefix(secname, ".s390_return"))) + nospec_revert(aseg, aseg + s->sh_size); + +#ifdef CONFIG_FUNCTION_TRACER + if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { + ret = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (ret < 0) + return ret; + } +#endif /* CONFIG_FUNCTION_TRACER */ + } + + return 0; +} diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c new file mode 100644 index 0000000000..38ec048752 --- /dev/null +++ b/arch/s390/kernel/nmi.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Machine check handler + * + * Copyright IBM Corp. 2000, 2009 + * Author(s): Ingo Adlung , + * Martin Schwidefsky , + * Cornelia Huck , + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct mcck_struct { + unsigned int kill_task : 1; + unsigned int channel_report : 1; + unsigned int warning : 1; + unsigned int stp_queue : 1; + unsigned long mcck_code; +}; + +static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); + +static inline int nmi_needs_mcesa(void) +{ + return MACHINE_HAS_VX || MACHINE_HAS_GS; +} + +/* + * The initial machine check extended save area for the boot CPU. + * It will be replaced on the boot CPU reinit with an allocated + * structure. The structure is required for machine check happening + * early in the boot process. + */ +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); + +void __init nmi_alloc_mcesa_early(u64 *mcesad) +{ + if (!nmi_needs_mcesa()) + return; + *mcesad = __pa(&boot_mcesa); + if (MACHINE_HAS_GS) + *mcesad |= ilog2(MCESA_MAX_SIZE); +} + +int nmi_alloc_mcesa(u64 *mcesad) +{ + unsigned long size; + void *origin; + + *mcesad = 0; + if (!nmi_needs_mcesa()) + return 0; + size = MACHINE_HAS_GS ? MCESA_MAX_SIZE : MCESA_MIN_SIZE; + origin = kmalloc(size, GFP_KERNEL); + if (!origin) + return -ENOMEM; + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak(origin); + *mcesad = __pa(origin); + if (MACHINE_HAS_GS) + *mcesad |= ilog2(MCESA_MAX_SIZE); + return 0; +} + +void nmi_free_mcesa(u64 *mcesad) +{ + if (!nmi_needs_mcesa()) + return; + kfree(__va(*mcesad & MCESA_ORIGIN_MASK)); +} + +static __always_inline char *nmi_puts(char *dest, const char *src) +{ + while (*src) + *dest++ = *src++; + *dest = 0; + return dest; +} + +static __always_inline char *u64_to_hex(char *dest, u64 val) +{ + int i, num; + + for (i = 1; i <= 16; i++) { + num = (val >> (64 - 4 * i)) & 0xf; + if (num >= 10) + *dest++ = 'A' + num - 10; + else + *dest++ = '0' + num; + } + *dest = 0; + return dest; +} + +static notrace void s390_handle_damage(void) +{ + union ctlreg0 cr0, cr0_new; + char message[100]; + psw_t psw_save; + char *ptr; + + smp_emergency_stop(); + diag_amode31_ops.diag308_reset(); + ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); + u64_to_hex(ptr, S390_lowcore.mcck_interruption_code); + + /* + * Disable low address protection and make machine check new PSW a + * disabled wait PSW. Any additional machine check cannot be handled. + */ + __ctl_store(cr0.val, 0, 0); + cr0_new = cr0; + cr0_new.lap = 0; + __ctl_load(cr0_new.val, 0, 0); + psw_save = S390_lowcore.mcck_new_psw; + psw_bits(S390_lowcore.mcck_new_psw).io = 0; + psw_bits(S390_lowcore.mcck_new_psw).ext = 0; + psw_bits(S390_lowcore.mcck_new_psw).wait = 1; + sclp_emergency_printk(message); + + /* + * Restore machine check new PSW and control register 0 to original + * values. This makes possible system dump analysis easier. + */ + S390_lowcore.mcck_new_psw = psw_save; + __ctl_load(cr0.val, 0, 0); + disabled_wait(); + while (1); +} +NOKPROBE_SYMBOL(s390_handle_damage); + +/* + * Main machine check handler function. Will be called with interrupts disabled + * and machine checks enabled. + */ +void s390_handle_mcck(void) +{ + struct mcck_struct mcck; + + /* + * Disable machine checks and get the current state of accumulated + * machine checks. Afterwards delete the old state and enable machine + * checks again. + */ + local_mcck_disable(); + mcck = *this_cpu_ptr(&cpu_mcck); + memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); + local_mcck_enable(); + + if (mcck.channel_report) + crw_handle_channel_report(); + /* + * A warning may remain for a prolonged period on the bare iron. + * (actually until the machine is powered off, or the problem is gone) + * So we just stop listening for the WARNING MCH and avoid continuously + * being interrupted. One caveat is however, that we must do this per + * processor and cannot use the smp version of ctl_clear_bit(). + * On VM we only get one interrupt per virtally presented machinecheck. + * Though one suffices, we may get one interrupt per (virtual) cpu. + */ + if (mcck.warning) { /* WARNING pending ? */ + static int mchchk_wng_posted = 0; + + /* Use single cpu clear, as we cannot handle smp here. */ + __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + if (xchg(&mchchk_wng_posted, 1) == 0) + kill_cad_pid(SIGPWR, 1); + } + if (mcck.stp_queue) + stp_queue_work(); + if (mcck.kill_task) { + printk(KERN_EMERG "mcck: Terminating task because of machine " + "malfunction (code 0x%016lx).\n", mcck.mcck_code); + printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", + current->comm, current->pid); + if (is_global_init(current)) + panic("mcck: Attempting to kill init!\n"); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID); + } +} + +/* + * returns 0 if register contents could be validated + * returns 1 otherwise + */ +static int notrace s390_validate_registers(union mci mci) +{ + struct mcesa *mcesa; + void *fpt_save_area; + union ctlreg2 cr2; + int kill_task; + u64 zero; + + kill_task = 0; + zero = 0; + + if (!mci.gr || !mci.fp) + kill_task = 1; + fpt_save_area = &S390_lowcore.floating_pt_save_area; + if (!mci.fc) { + kill_task = 1; + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); + } + + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!MACHINE_HAS_VX) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + + /* + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. + */ + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + cr0.val = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + __ctl_load(cr0.val, 0, 0); + asm volatile( + " la 1,%0\n" + " VLM 0,15,0,1\n" + " VLM 16,31,256,1\n" + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); + } + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); + if (!mci.ar) + kill_task = 1; + /* Validate guarded storage registers */ + cr2.val = S390_lowcore.cregs_save_area[2]; + if (cr2.gse) { + if (!mci.gs) { + /* + * 2 cases: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of + * guarded storage control can not be recreated, the + * process must be terminated. + * For SIE the guest values of guarded storage can not + * be recreated. This is either due to a bug or due to + * GS being disabled in the guest. The guest will be + * notified by KVM code and the guests machine check + * handling must take care of this. The host values + * are saved by KVM and are not affected. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); + } + } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally. + */ + set_tod_programmable_field(raw_smp_processor_id()); + /* Validate clock comparator register */ + set_clock_comparator(S390_lowcore.clock_comparator); + + if (!mci.ms || !mci.pm || !mci.ia) + kill_task = 1; + + return kill_task; +} +NOKPROBE_SYMBOL(s390_validate_registers); + +/* + * Backup the guest's machine check info to its description block + */ +static void notrace s390_backup_mcck_info(struct pt_regs *regs) +{ + struct mcck_volatile_info *mcck_backup; + struct sie_page *sie_page; + + /* r14 contains the sie block, which was set in sie64a */ + struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]); + + if (sie_block == NULL) + /* Something's seriously wrong, stop system. */ + s390_handle_damage(); + + sie_page = container_of(sie_block, struct sie_page, sie_block); + mcck_backup = &sie_page->mcck_info; + mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); + mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; + mcck_backup->failing_storage_address + = S390_lowcore.failing_storage_address; +} +NOKPROBE_SYMBOL(s390_backup_mcck_info); + +#define MAX_IPD_COUNT 29 +#define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ + +#define ED_STP_ISLAND 6 /* External damage STP island check */ +#define ED_STP_SYNC 7 /* External damage STP sync check */ + +#define MCCK_CODE_NO_GUEST (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE) + +/* + * machine check handler. + */ +void notrace s390_do_machine_check(struct pt_regs *regs) +{ + static int ipd_count; + static DEFINE_SPINLOCK(ipd_lock); + static unsigned long long last_ipd; + struct mcck_struct *mcck; + unsigned long long tmp; + irqentry_state_t irq_state; + union mci mci; + unsigned long mcck_dam_code; + int mcck_pending = 0; + + irq_state = irqentry_nmi_enter(regs); + + if (user_mode(regs)) + update_timer_mcck(); + inc_irq_stat(NMI_NMI); + mci.val = S390_lowcore.mcck_interruption_code; + mcck = this_cpu_ptr(&cpu_mcck); + + /* + * Reinject the instruction processing damages' machine checks + * including Delayed Access Exception into the guest + * instead of damaging the host if they happen in the guest. + */ + if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) { + if (mci.b) { + /* Processing backup -> verify if we can survive this */ + u64 z_mcic, o_mcic, t_mcic; + z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); + o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | + 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | + 1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 | + 1ULL<<16); + t_mcic = mci.val; + + if (((t_mcic & z_mcic) != 0) || + ((t_mcic & o_mcic) != o_mcic)) { + s390_handle_damage(); + } + + /* + * Nullifying exigent condition, therefore we might + * retry this instruction. + */ + spin_lock(&ipd_lock); + tmp = get_tod_clock(); + if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME) + ipd_count++; + else + ipd_count = 1; + last_ipd = tmp; + if (ipd_count == MAX_IPD_COUNT) + s390_handle_damage(); + spin_unlock(&ipd_lock); + } else { + /* Processing damage -> stopping machine */ + s390_handle_damage(); + } + } + if (s390_validate_registers(mci)) { + if (!user_mode(regs)) + s390_handle_damage(); + /* + * Couldn't restore all register contents for the + * user space process -> mark task for termination. + */ + mcck->kill_task = 1; + mcck->mcck_code = mci.val; + mcck_pending = 1; + } + + /* + * Backup the machine check's info if it happens when the guest + * is running. + */ + if (test_cpu_flag(CIF_MCCK_GUEST)) + s390_backup_mcck_info(regs); + + if (mci.cd) { + /* Timing facility damage */ + s390_handle_damage(); + } + if (mci.ed && mci.ec) { + /* External damage */ + if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + mcck->stp_queue |= stp_sync_check(); + if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + mcck->stp_queue |= stp_island_check(); + mcck_pending = 1; + } + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + /* Storage error uncorrected */ + if (mci.se) + s390_handle_damage(); + /* Storage key-error uncorrected */ + if (mci.ke) + s390_handle_damage(); + /* Storage degradation */ + if (mci.ds && mci.fa) + s390_handle_damage(); + } + if (mci.cp) { + /* Channel report word pending */ + mcck->channel_report = 1; + mcck_pending = 1; + } + if (mci.w) { + /* Warning pending */ + mcck->warning = 1; + mcck_pending = 1; + } + + /* + * If there are only Channel Report Pending and External Damage + * machine checks, they will not be reinjected into the guest + * because they refer to host conditions only. + */ + mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK); + if (test_cpu_flag(CIF_MCCK_GUEST) && + (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) { + /* Set exit reason code for host's later handling */ + *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; + } + clear_cpu_flag(CIF_MCCK_GUEST); + + if (mcck_pending) + schedule_mcck_handler(); + + irqentry_nmi_exit(regs, irq_state); +} +NOKPROBE_SYMBOL(s390_do_machine_check); + +static int __init machine_check_init(void) +{ + ctl_set_bit(14, 25); /* enable external damage MCH */ + ctl_set_bit(14, 27); /* enable system recovery MCH */ + ctl_set_bit(14, 24); /* enable warning MCH */ + return 0; +} +early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c new file mode 100644 index 0000000000..d1b16d83e4 --- /dev/null +++ b/arch/s390/kernel/nospec-branch.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +static int __init nobp_setup_early(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + if (enabled && test_facility(82)) { + /* + * The user explicitly requested nobp=1, enable it and + * disable the expoline support. + */ + __set_facility(82, alt_stfle_fac_list); + if (IS_ENABLED(CONFIG_EXPOLINE)) + nospec_disable = 1; + } else { + __clear_facility(82, alt_stfle_fac_list); + } + return 0; +} +early_param("nobp", nobp_setup_early); + +static int __init nospec_setup_early(char *str) +{ + __clear_facility(82, alt_stfle_fac_list); + return 0; +} +early_param("nospec", nospec_setup_early); + +static int __init nospec_report(void) +{ + if (test_facility(156)) + pr_info("Spectre V2 mitigation: etokens\n"); + if (nospec_uses_trampoline()) + pr_info("Spectre V2 mitigation: execute trampolines\n"); + if (__test_facility(82, alt_stfle_fac_list)) + pr_info("Spectre V2 mitigation: limited branch prediction\n"); + return 0; +} +arch_initcall(nospec_report); + +#ifdef CONFIG_EXPOLINE + +int nospec_disable = IS_ENABLED(CONFIG_EXPOLINE_OFF); + +static int __init nospectre_v2_setup_early(char *str) +{ + nospec_disable = 1; + return 0; +} +early_param("nospectre_v2", nospectre_v2_setup_early); + +void __init nospec_auto_detect(void) +{ + if (test_facility(156) || cpu_mitigations_off()) { + /* + * The machine supports etokens. + * Disable expolines and disable nobp. + */ + if (__is_defined(CC_USING_EXPOLINE)) + nospec_disable = 1; + __clear_facility(82, alt_stfle_fac_list); + } else if (__is_defined(CC_USING_EXPOLINE)) { + /* + * The kernel has been compiled with expolines. + * Keep expolines enabled and disable nobp. + */ + nospec_disable = 0; + __clear_facility(82, alt_stfle_fac_list); + } + /* + * If the kernel has not been compiled with expolines the + * nobp setting decides what is done, this depends on the + * CONFIG_KERNEL_NP option and the nobp/nospec parameters. + */ +} + +static int __init spectre_v2_setup_early(char *str) +{ + if (str && !strncmp(str, "on", 2)) { + nospec_disable = 0; + __clear_facility(82, alt_stfle_fac_list); + } + if (str && !strncmp(str, "off", 3)) + nospec_disable = 1; + if (str && !strncmp(str, "auto", 4)) + nospec_auto_detect(); + return 0; +} +early_param("spectre_v2", spectre_v2_setup_early); + +static void __init_or_module __nospec_revert(s32 *start, s32 *end) +{ + enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type; + static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 }; + u8 *instr, *thunk, *br; + u8 insnbuf[6]; + s32 *epo; + + /* Second part of the instruction replace is always a nop */ + memcpy(insnbuf + 2, branch, sizeof(branch)); + for (epo = start; epo < end; epo++) { + instr = (u8 *) epo + *epo; + if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) + type = BRCL_EXPOLINE; /* brcl instruction */ + else if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x05) + type = BRASL_EXPOLINE; /* brasl instruction */ + else + continue; + thunk = instr + (*(int *)(instr + 2)) * 2; + if (thunk[0] == 0xc6 && thunk[1] == 0x00) + /* exrl %r0, */ + br = thunk + (*(int *)(thunk + 2)) * 2; + else + continue; + if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) + continue; + switch (type) { + case BRCL_EXPOLINE: + /* brcl to thunk, replace with br + nop */ + insnbuf[0] = br[0]; + insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + break; + case BRASL_EXPOLINE: + /* brasl to thunk, replace with basr + nop */ + insnbuf[0] = 0x0d; + insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + break; + } + + s390_kernel_write(instr, insnbuf, 6); + } +} + +void __init_or_module nospec_revert(s32 *start, s32 *end) +{ + if (nospec_disable) + __nospec_revert(start, end); +} + +extern s32 __nospec_call_start[], __nospec_call_end[]; +extern s32 __nospec_return_start[], __nospec_return_end[]; +void __init nospec_init_branches(void) +{ + nospec_revert(__nospec_call_start, __nospec_call_end); + nospec_revert(__nospec_return_start, __nospec_return_end); +} + +#endif /* CONFIG_EXPOLINE */ diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c new file mode 100644 index 0000000000..52d4353188 --- /dev/null +++ b/arch/s390/kernel/nospec-sysfs.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (test_facility(156)) + return sprintf(buf, "Mitigation: etokens\n"); + if (nospec_uses_trampoline()) + return sprintf(buf, "Mitigation: execute trampolines\n"); + if (__test_facility(82, alt_stfle_fac_list)) + return sprintf(buf, "Mitigation: limited branch prediction\n"); + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c new file mode 100644 index 0000000000..23ab9f02f2 --- /dev/null +++ b/arch/s390/kernel/numa.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 2015 + */ + +#include +#include +#include +#include +#include +#include + +struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +void __init numa_setup(void) +{ + int nid; + + nodes_clear(node_possible_map); + node_set(0, node_possible_map); + node_set_online(0); + for (nid = 0; nid < MAX_NUMNODES; nid++) { + NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); + if (!NODE_DATA(nid)) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(pg_data_t), 8); + } + NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; + NODE_DATA(0)->node_id = 0; +} diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c new file mode 100644 index 0000000000..6e1824141b --- /dev/null +++ b/arch/s390/kernel/os_info.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OS info memory interface + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu + */ + +#define KMSG_COMPONENT "os_info" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * OS info structure has to be page aligned + */ +static struct os_info os_info __page_aligned_data; + +/* + * Compute checksum over OS info structure + */ +u32 os_info_csum(struct os_info *os_info) +{ + int size = sizeof(*os_info) - offsetof(struct os_info, version_major); + return (__force u32)csum_partial(&os_info->version_major, size, 0); +} + +/* + * Add crashkernel info to OS info and update checksum + */ +void os_info_crashkernel_add(unsigned long base, unsigned long size) +{ + os_info.crashkernel_addr = (u64)(unsigned long)base; + os_info.crashkernel_size = (u64)(unsigned long)size; + os_info.csum = os_info_csum(&os_info); +} + +/* + * Add OS info entry and update checksum + */ +void os_info_entry_add(int nr, void *ptr, u64 size) +{ + os_info.entry[nr].addr = __pa(ptr); + os_info.entry[nr].size = size; + os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); + os_info.csum = os_info_csum(&os_info); +} + +/* + * Initialize OS info structure and set lowcore pointer + */ +void __init os_info_init(void) +{ + struct lowcore *abs_lc; + + os_info.version_major = OS_INFO_VERSION_MAJOR; + os_info.version_minor = OS_INFO_VERSION_MINOR; + os_info.magic = OS_INFO_MAGIC; + os_info.csum = os_info_csum(&os_info); + abs_lc = get_abs_lowcore(); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc); +} + +#ifdef CONFIG_CRASH_DUMP + +static struct os_info *os_info_old; + +/* + * Allocate and copy OS info entry from oldmem + */ +static void os_info_old_alloc(int nr, int align) +{ + unsigned long addr, size = 0; + char *buf, *buf_align, *msg; + u32 csum; + + addr = os_info_old->entry[nr].addr; + if (!addr) { + msg = "not available"; + goto fail; + } + size = os_info_old->entry[nr].size; + buf = kmalloc(size + align - 1, GFP_KERNEL); + if (!buf) { + msg = "alloc failed"; + goto fail; + } + buf_align = PTR_ALIGN(buf, align); + if (copy_oldmem_kernel(buf_align, addr, size)) { + msg = "copy failed"; + goto fail_free; + } + csum = (__force u32)csum_partial(buf_align, size, 0); + if (csum != os_info_old->entry[nr].csum) { + msg = "checksum failed"; + goto fail_free; + } + os_info_old->entry[nr].addr = (u64)(unsigned long)buf_align; + msg = "copied"; + goto out; +fail_free: + kfree(buf); +fail: + os_info_old->entry[nr].addr = 0; +out: + pr_info("entry %i: %s (addr=0x%lx size=%lu)\n", + nr, msg, addr, size); +} + +/* + * Initialize os info and os info entries from oldmem + */ +static void os_info_old_init(void) +{ + static int os_info_init; + unsigned long addr; + + if (os_info_init) + return; + if (!oldmem_data.start) + goto fail; + if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) + goto fail; + if (addr == 0 || addr % PAGE_SIZE) + goto fail; + os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); + if (!os_info_old) + goto fail; + if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) + goto fail_free; + if (os_info_old->magic != OS_INFO_MAGIC) + goto fail_free; + if (os_info_old->csum != os_info_csum(os_info_old)) + goto fail_free; + if (os_info_old->version_major > OS_INFO_VERSION_MAJOR) + goto fail_free; + os_info_old_alloc(OS_INFO_VMCOREINFO, 1); + os_info_old_alloc(OS_INFO_REIPL_BLOCK, 1); + pr_info("crashkernel: addr=0x%lx size=%lu\n", + (unsigned long) os_info_old->crashkernel_addr, + (unsigned long) os_info_old->crashkernel_size); + os_info_init = 1; + return; +fail_free: + kfree(os_info_old); +fail: + os_info_init = 1; + os_info_old = NULL; +} + +/* + * Return pointer to os infor entry and its size + */ +void *os_info_old_entry(int nr, unsigned long *size) +{ + os_info_old_init(); + + if (!os_info_old) + return NULL; + if (!os_info_old->entry[nr].addr) + return NULL; + *size = (unsigned long) os_info_old->entry[nr].size; + return (void *)(unsigned long)os_info_old->entry[nr].addr; +} +#endif diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c new file mode 100644 index 0000000000..850c11ea63 --- /dev/null +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -0,0 +1,1950 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for s390x - CPU-measurement Counter Facility + * + * Copyright IBM Corp. 2012, 2023 + * Author(s): Hendrik Brueckner + * Thomas Richter + */ +#define KMSG_COMPONENT "cpum_cf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + refcount_t refcnt; /* Reference count */ + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +/* + * The CPU Measurement query counter information instruction contains + * information which varies per machine generation, but is constant and + * does not change when running on a particular machine, such as counter + * first and second version number. This is needed to determine the size + * of counter sets. Extract this information at device driver initialization. + */ +static struct cpumf_ctr_info cpumf_ctr_info; + +struct cpu_cf_ptr { + struct cpu_cf_events *cpucf; +}; + +static struct cpu_cf_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct cpu_cf_ptr __percpu *cfptr; +} cpu_cf_root; + +/* + * Serialize event initialization and event removal. Both are called from + * user space in task context with perf_event_open() and close() + * system calls. + * + * This mutex serializes functions cpum_cf_alloc_cpu() called at event + * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu() + * called at event removal via call back function hw_perf_event_destroy() + * when the event is deleted. They are serialized to enforce correct + * bookkeeping of pointer and reference counts anchored by + * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the + * per CPU pointers stored in cpu_cf_root::cfptr. + */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Get pointer to per-cpu structure. + * + * Function get_cpu_cfhw() is called from + * - cfset_copy_all(): This function is protected by cpus_read_lock(), so + * CPU hot plug remove can not happen. Event removal requires a close() + * first. + * + * Function this_cpu_cfhw() is called from perf common code functions: + * - pmu_{en|dis}able(), pmu_{add|del}()and pmu_{start|stop}(): + * All functions execute with interrupts disabled on that particular CPU. + * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_copy_all(). + * + * Therefore it is safe to access the CPU specific pointer to the event. + */ +static struct cpu_cf_events *get_cpu_cfhw(int cpu) +{ + struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr; + + if (p) { + struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu); + + return q->cpucf; + } + return NULL; +} + +static struct cpu_cf_events *this_cpu_cfhw(void) +{ + return get_cpu_cfhw(smp_processor_id()); +} + +/* Disable counter sets on dedicated CPU */ +static void cpum_cf_reset_cpu(void *flags) +{ + lcctl(0); +} + +/* Free per CPU data when the last event is removed. */ +static void cpum_cf_free_root(void) +{ + if (!refcount_dec_and_test(&cpu_cf_root.refcnt)) + return; + free_percpu(cpu_cf_root.cfptr); + cpu_cf_root.cfptr = NULL; + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n", + __func__, refcount_read(&cpu_cf_root.refcnt), + !cpu_cf_root.cfptr); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int cpum_cf_alloc_root(void) +{ + int rc = 0; + + if (refcount_inc_not_zero(&cpu_cf_root.refcnt)) + return rc; + + /* The memory is already zeroed. */ + cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr); + if (cpu_cf_root.cfptr) { + refcount_set(&cpu_cf_root.refcnt, 1); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + } else { + rc = -ENOMEM; + } + + return rc; +} + +/* Free CPU counter data structure for a PMU */ +static void cpum_cf_free_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + + mutex_lock(&pmc_reserve_mutex); + /* + * When invoked via CPU hotplug handler, there might be no events + * installed or that particular CPU might not have an + * event installed. This anchor pointer can be NULL! + */ + if (!cpu_cf_root.cfptr) + goto out; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + /* + * Might be zero when called from CPU hotplug handler and no event + * installed on that CPU, but on different CPUs. + */ + if (!cpuhw) + goto out; + + if (refcount_dec_and_test(&cpuhw->refcnt)) { + kfree(cpuhw); + p->cpucf = NULL; + } + cpum_cf_free_root(); +out: + mutex_unlock(&pmc_reserve_mutex); +} + +/* Allocate CPU counter data structure for a PMU. Called under mutex lock. */ +static int cpum_cf_alloc_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + int rc; + + mutex_lock(&pmc_reserve_mutex); + rc = cpum_cf_alloc_root(); + if (rc) + goto unlock; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + + if (!cpuhw) { + cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + if (cpuhw) { + p->cpucf = cpuhw; + refcount_set(&cpuhw->refcnt, 1); + } else { + rc = -ENOMEM; + } + } else { + refcount_inc(&cpuhw->refcnt); + } + if (rc) { + /* + * Error in allocation of event, decrement anchor. Since + * cpu_cf_event in not created, its destroy() function is not + * invoked. Adjust the reference counter for the anchor. + */ + cpum_cf_free_root(); + } +unlock: + mutex_unlock(&pmc_reserve_mutex); + return rc; +} + +/* + * Create/delete per CPU data structures for /dev/hwctr interface and events + * created by perf_event_open(). + * If cpu is -1, track task on all available CPUs. This requires + * allocation of hardware data structures for all CPUs. This setup handles + * perf_event_open() with task context and /dev/hwctr interface. + * If cpu is non-zero install event on this CPU only. This setup handles + * perf_event_open() with CPU context. + */ +static int cpum_cf_alloc(int cpu) +{ + cpumask_var_t mask; + int rc; + + if (cpu == -1) { + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + for_each_online_cpu(cpu) { + rc = cpum_cf_alloc_cpu(cpu); + if (rc) { + for_each_cpu(cpu, mask) + cpum_cf_free_cpu(cpu); + break; + } + cpumask_set_cpu(cpu, mask); + } + free_cpumask_var(mask); + } else { + rc = cpum_cf_alloc_cpu(cpu); + } + return rc; +} + +static void cpum_cf_free(int cpu) +{ + if (cpu == -1) { + for_each_online_cpu(cpu) + cpum_cf_free_cpu(cpu); + } else { + cpum_cf_free_cpu(cpu); + } +} + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + /* interval in seconds */ + +/* Counter sets are stored as data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a one byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a three byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight byte in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets are calculated and stored in the event + * sample data area. + */ +struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpuid cpuid; + + te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */ + te->csvn = cpumf_ctr_info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* + * The number of counters per counter set varies between machine generations, + * but is constant when running on a particular machine generation. + * Determine each counter set size at device driver initialization and + * retrieve it later. + */ +static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX]; +static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (cpumf_ctr_info.cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn == 1) + ctrset_size = 6; + else if (cpumf_ctr_info.cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 16; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn == 1) + ctrset_size = 32; + else if (cpumf_ctr_info.csvn == 2) + ctrset_size = 48; + else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 128; + else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + cpumf_ctr_setsizes[ctrset] = ctrset_size; +} + +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset) +{ + return cpumf_ctr_setsizes[ctrset]; +} + +/* Read a counter set. The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with header containing the counter set number and + * the number of eight byte counters. + * + * The functions returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned on this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on error_ok value this is ok, + * for example when called from cpumf_pmu_start() call back function. + */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_read_setsize(ctrset); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + return need; +} + +static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 byte host an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. + */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} + +static enum cpumf_ctr_set get_counter_set(u64 event) +{ + int set = CPUMF_CTR_SET_MAX; + + if (event < 32) + set = CPUMF_CTR_SET_BASIC; + else if (event < 64) + set = CPUMF_CTR_SET_USER; + else if (event < 128) + set = CPUMF_CTR_SET_CRYPTO; + else if (event < 288) + set = CPUMF_CTR_SET_EXT; + else if (event >= 448 && event < 496) + set = CPUMF_CTR_SET_MT_DIAG; + + return set; +} + +static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set) +{ + u16 mtdiag_ctl; + int err = 0; + + /* check required version for counter sets */ + switch (set) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn < 1) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_CRYPTO: + if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 && + config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83)) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn < 1) + err = -EOPNOTSUPP; + if ((cpumf_ctr_info.csvn == 1 && config > 159) || + (cpumf_ctr_info.csvn == 2 && config > 175) || + (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 && + config > 255) || + (cpumf_ctr_info.csvn >= 6 && config > 287)) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn <= 3) + err = -EOPNOTSUPP; + /* + * MT-diagnostic counters are read-only. The counter set + * is automatically enabled and activated on all CPUs with + * multithreading (SMT). Deactivation of multithreading + * also disables the counter set. State changes are ignored + * by lcctl(). Because Linux controls SMT enablement through + * a kernel parameter only, the counter set is either disabled + * or enabled and active. + * + * Thus, the counters can only be used if SMT is on and the + * counter set is enabled and active. + */ + mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; + if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) && + (cpumf_ctr_info.enable_ctl & mtdiag_ctl) && + (cpumf_ctr_info.act_ctl & mtdiag_ctl))) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; + } + + return err; +} + +/* + * Change the CPUMF state to active. + * Enable and activate the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_enable(struct pmu *pmu) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int err; + + if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED)) + return; + + err = lcctl(cpuhw->state | cpuhw->dev_state); + if (err) + pr_err("Enabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags |= PMU_F_ENABLED; +} + +/* + * Change the CPUMF state to inactive. + * Disable and enable (inactive) the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_disable(struct pmu *pmu) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + u64 inactive; + int err; + + if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED)) + return; + + inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; + err = lcctl(inactive); + if (err) + pr_err("Disabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags &= ~PMU_F_ENABLED; +} + +/* Release the PMU if event is the last perf event */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + cpum_cf_free(event->cpu); +} + +/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ +static const int cpumf_generic_events_basic[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; +/* CPUMF <-> perf event mappings for userspace (problem-state set) */ +static const int cpumf_generic_events_user[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 32, + [PERF_COUNT_HW_INSTRUCTIONS] = 33, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; + +static int is_userspace_event(u64 ev) +{ + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; +} + +static int __hw_perf_event_init(struct perf_event *event, unsigned int type) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + enum cpumf_ctr_set set; + u64 ev; + + switch (type) { + case PERF_TYPE_RAW: + /* Raw events are used to access counters directly, + * hence do not permit excludes */ + if (attr->exclude_kernel || attr->exclude_user || + attr->exclude_hv) + return -EOPNOTSUPP; + ev = attr->config; + break; + + case PERF_TYPE_HARDWARE: + if (is_sampling_event(event)) /* No sampling support */ + return -ENOENT; + ev = attr->config; + if (!attr->exclude_user && attr->exclude_kernel) { + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } + } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ + return -EOPNOTSUPP; + } else { + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } + } + break; + + default: + return -ENOENT; + } + + if (ev == -1) + return -ENOENT; + + if (ev > PERF_CPUM_CF_MAX_CTR) + return -ENOENT; + + /* Obtain the counter set to which the specified counter belongs */ + set = get_counter_set(ev); + switch (set) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + case CPUMF_CTR_SET_MT_DIAG: + /* + * Use the hardware perf event structure to store the + * counter number in the 'config' member and the counter + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). + */ + hwc->config = ev; + hwc->config_base = cpumf_ctr_ctl[set]; + break; + case CPUMF_CTR_SET_MAX: + /* The counter could not be associated to a counter set */ + return -EINVAL; + } + + /* Initialize for using the CPU-measurement counter facility */ + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; + event->destroy = hw_perf_event_destroy; + + /* + * Finally, validate version and authorization of the counter set. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might suffice the event request. + */ + if (!(hwc->config_base & cpumf_ctr_info.auth_ctl)) + return -ENOENT; + return validate_ctr_version(hwc->config, set); +} + +/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different + * attribute::type values: + * - PERF_TYPE_HARDWARE: + * - pmu->type: + * Handle both type of invocations identical. They address the same hardware. + * The result is different when event modifiers exclude_kernel and/or + * exclude_user are also set. + */ +static int cpumf_pmu_event_type(struct perf_event *event) +{ + u64 ev = event->attr.config; + + if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) + return PERF_TYPE_HARDWARE; + return PERF_TYPE_RAW; +} + +static int cpumf_pmu_event_init(struct perf_event *event) +{ + unsigned int type = event->attr.type; + int err; + + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) + err = __hw_perf_event_init(event, type); + else if (event->pmu->type == type) + /* Registered as unknown PMU */ + err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); + else + return -ENOENT; + + if (unlikely(err) && event->destroy) + event->destroy(event); + + return err; +} + +static int hw_perf_event_reset(struct perf_event *event) +{ + u64 prev, new; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) { + if (err != 3) + break; + /* The counter is not (yet) available. This + * might happen if the counter set to which + * this counter belongs is in the disabled + * state. + */ + new = 0; + } + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + return err; +} + +static void hw_perf_event_update(struct perf_event *event) +{ + u64 prev, new, delta; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) + return; + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void cpumf_pmu_read(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return; + + hw_perf_event_update(event); +} + +static void cpumf_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct hw_perf_event *hwc = &event->hw; + int i; + + if (!(hwc->state & PERF_HES_STOPPED)) + return; + + hwc->state = 0; + + /* (Re-)enable and activate the counter set */ + ctr_set_enable(&cpuhw->state, hwc->config_base); + ctr_set_start(&cpuhw->state, hwc->config_base); + + /* The counter set to which this counter belongs can be already active. + * Because all counters in a set are active, the event->hw.prev_count + * needs to be synchronized. At this point, the counter set can be in + * the inactive or disabled state. + */ + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } + + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; +} + +static void cpumf_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct hw_perf_event *hwc = &event->hw; + int i; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* Decrement reference count for this counter set and if this + * is the last used counter in the set, clear activation + * control and set the counter set state to inactive. + */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } + hwc->state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else { + hw_perf_event_update(event); + } + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int cpumf_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + + ctr_set_enable(&cpuhw->state, event->hw.config_base); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + cpumf_pmu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void cpumf_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int i; + + cpumf_pmu_stop(event, PERF_EF_UPDATE); + + /* Check if any counter in the counter set is still used. If not used, + * change the counter set to the disabled state. This also clears the + * content of all counters in the set. + * + * When a new perf event has been added but not yet started, this can + * clear enable control and resets all counters in a set. Therefore, + * cpumf_pmu_start() always has to reenable a counter set. + */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); +} + +/* Performance monitoring unit for s390x */ +static struct pmu cpumf_pmu = { + .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .event_init = cpumf_pmu_event_init, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cpumf_pmu_read, +}; + +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */ +/* + * Synchronize access to device /dev/hwc. This mutex protects against + * concurrent access to functions cfset_open() and cfset_release(). + * Same for CPU hotplug add and remove events triggering + * cpum_cf_online_cpu() and cpum_cf_offline_cpu(). + * It also serializes concurrent device ioctl access from multiple + * processes accessing /dev/hwc. + * + * The mutex protects concurrent access to the /dev/hwctr session management + * struct cfset_session and reference counting variable cfset_opencnt. + */ +static DEFINE_MUTEX(cfset_ctrset_mutex); + +/* + * CPU hotplug handles only /dev/hwctr device. + * For perf_event_open() the CPU hotplug handling is done on kernel common + * code: + * - CPU add: Nothing is done since a file descriptor can not be created + * and returned to the user. + * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and + * pmu_delete(). The event itself is removed when the file descriptor is + * closed. + */ +static int cfset_online_cpu(unsigned int cpu); + +static int cpum_cf_online_cpu(unsigned int cpu) +{ + int rc = 0; + + /* + * Ignore notification for perf_event_open(). + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + rc = cpum_cf_alloc_cpu(cpu); + if (!rc) + cfset_online_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return rc; +} + +static int cfset_offline_cpu(unsigned int cpu); + +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + /* + * During task exit processing of grouped perf events triggered by CPU + * hotplug processing, pmu_disable() is called as part of perf context + * removal process. Therefore do not trigger event removal now for + * perf_event_open() created events. Perf common code triggers event + * destruction when the event file descriptor is closed. + * + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + cfset_offline_cpu(cpu); + cpum_cf_free_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; + + inc_irq_stat(IRQEXT_CMC); + + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. + */ + cpuhw = this_cpu_cfhw(); + if (!cpuhw) + return; + + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpumf_ctr_info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); +} + +static int cfset_init(void); +static int __init cpumf_pmu_init(void) +{ + int rc; + + /* Extract counter measurement facility information */ + if (!cpum_cf_avail() || qctri(&cpumf_ctr_info)) + return -ENODEV; + + /* Determine and store counter set sizes for later reference */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + cpum_cf_make_setsize(rc); + + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + ctl_clear_bit(0, 48); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + rc = -ENOMEM; + goto out1; + } + debug_register_view(cf_dbg, &debug_sprintf_view); + + cpumf_pmu.attr_groups = cpumf_cf_event_group(); + rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); + if (rc) { + pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. + */ + +struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ + struct list_head node; /* Chain to cfset_session.head */ +}; + +static void cfset_session_init(void) +{ + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_del(struct cfset_request *p) +{ + list_del(&p->node); +} + +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_add(struct cfset_request *p) +{ + list_add(&p->node, &cfset_session.head); +} + +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access + * path is currently used. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_enable() and its call backs + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable functions are used to turn off + * x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counters sets during scheduling. + * + * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr + * device access. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When /dev/hwctr interface is also used at the same time, the counter sets + * will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */ +/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + /* Check if any counter set used by /dev/hwctr */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + int rc; + + cpuhw->dev_state = 0; + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); +} + +/* This modifies the process CPU mask to adopt it to the currently online + * CPUs. Offline CPUs can not be addresses. This call terminates the access + * and is usually followed by close() or a new iotcl(..., START, ...) which + * creates a new request structure. + */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; + + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); +} + +/* Release function is also called when application gets terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + */ +static int cfset_release(struct inode *inode, struct file *file) +{ + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */ + on_each_cpu(cfset_release_cpu, NULL, 1); + cpum_cf_free(-1); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* + * Open via /dev/hwctr device. Allocate all per CPU resources on the first + * open of the device. The last close releases all per CPU resources. + * Parallel perf_event_open system calls also use per CPU resources. + * These invocations are handled via reference counting on the per CPU data + * structures. + */ +static int cfset_open(struct inode *inode, struct file *file) +{ + int rc = 0; + + if (!perfmon_capable()) + return -EPERM; + file->private_data = NULL; + + mutex_lock(&cfset_ctrset_mutex); + if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */ + rc = cpum_cf_alloc(-1); + if (!rc) { + cfset_session_init(); + refcount_set(&cfset_opencnt, 1); + } + } + mutex_unlock(&cfset_ctrset_mutex); + + /* nonseekable_open() never fails */ + return rc ?: nonseekable_open(inode, file); +} + +static int cfset_all_start(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; + } + free_cpumask_var(mask); + return rc; +} + +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_read_setsize(i) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc = 0; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) { + rc = -EFAULT; + goto out; + } + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + rc = -EFAULT; +out: + return rc; +} + +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_read_setsize(set); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } +} + +static int cfset_all_read(unsigned long arg, struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); + return rc; +} + +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) +{ + int ret = -ENODATA; + + if (req && req->ctrset) + ret = cfset_all_read(arg, req); + return ret; +} + +static long cfset_ioctl_stop(struct file *file) +{ + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg, struct file *file) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + struct cfset_request *preq; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (file->private_data) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); + return -EFAULT; + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); + return -EINVAL; + } + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + } else { + kfree(preq); + } + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP stop + * command on the same file descriptor, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from specified CPU list given + * with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + cpus_read_lock(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg, file); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(file); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg, file->private_data); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + cpus_read_unlock(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, + .mode = 0666, +}; + +/* Hotplug add of a CPU. Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ +static int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } + } + return 0; +} + +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + * Adjust reference counts. + */ +static int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; + + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } + } + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ +} + +static int get_authctrsets(void) +{ + unsigned long auth = 0; + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets result in specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * the hardware init function is either called per-cpu or just once + * for all CPUS (event->cpu == -1). This depends on the whether + * counting is started for all CPUs or on a per workload base where + * the perf event moves from one CPU to another CPU. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. + */ + event->hw.config_base = get_authctrsets(); + + /* No authorized counter sets, nothing to count/sample */ + if (!event->hw.config_base) + err = -EINVAL; + + return err; +} + +static int cfdiag_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = -ENOENT; + + if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || + event->attr.type != event->pmu->type) + goto out; + + /* Raw events are used to access counters directly, + * hence do not permit excludes. + * This event is useless without PERF_SAMPLE_RAW to return counter set + * values as raw data. + */ + if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || + !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { + err = -EOPNOTSUPP; + goto out; + } + + /* Initialize for using the CPU-measurement counter facility */ + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; + event->destroy = hw_perf_event_destroy; + + err = cfdiag_event_init2(event); + if (unlikely(err)) + event->destroy(event); +out: + return err; +} + +/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used + * to collect the complete counter sets for a scheduled process. Target + * are complete counter sets attached as raw data to the artificial event. + * This results in complete counter sets available when a process is + * scheduled. Contains the delta of every counter while the process was + * running. + */ +CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); + +static struct attribute *cfdiag_events_attr[] = { + CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cfdiag_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cfdiag_events_group = { + .name = "events", + .attrs = cfdiag_events_attr, +}; +static struct attribute_group cfdiag_format_group = { + .name = "format", + .attrs = cfdiag_format_attr, +}; +static const struct attribute_group *cfdiag_attr_groups[] = { + &cfdiag_events_group, + &cfdiag_format_group, + NULL, +}; + +/* Performance monitoring unit for event CF_DIAG. Since this event + * is also started and stopped via the perf_event_open() system call, use + * the same event enable/disable call back functions. They do not + * have a pointer to the perf_event strcture as first parameter. + * + * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. + * Reuse them and distinguish the event (always first parameter) via + * 'config' member. + */ +static struct pmu cf_diag = { + .task_ctx_nr = perf_sw_context, + .event_init = cfdiag_event_init, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cfdiag_read, + + .attr_groups = cfdiag_attr_groups +}; + +/* Calculate memory needed to store all counter sets together with header and + * trailer data. This is independent of the counter set authorization which + * can vary depending on the configuration. + */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_read_setsize(i); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + unsigned long mhz; + + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: CPU speed extract static part. Used in case + * CPU Measurement Sampling Facility is turned off. + */ + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; +} + +static int cfset_init(void) +{ + size_t need; + int rc; + + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&cpumf_ctr_info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } + + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; +} + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c new file mode 100644 index 0000000000..0d64aafd15 --- /dev/null +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -0,0 +1,909 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Perf PMU sysfs events attributes for available CPU-measurement counters + * + */ + +#include +#include +#include + + +/* BEGIN: CPUM_CF COUNTER DEFINITIONS =================================== */ + +CPUMF_EVENT_ATTR(cf_fvn1, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn1, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn1, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES, 0x0022); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES, 0x0023); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES, 0x0024); +CPUMF_EVENT_ATTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES, 0x0025); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn1, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_fvn3, CPU_CYCLES, 0x0000); +CPUMF_EVENT_ATTR(cf_fvn3, INSTRUCTIONS, 0x0001); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_DIR_WRITES, 0x0002); +CPUMF_EVENT_ATTR(cf_fvn3, L1I_PENALTY_CYCLES, 0x0003); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); +CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); +CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS, 0x0042); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_FUNCTION_COUNT, 0x0050); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_CYCLES_COUNT, 0x0051); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT, 0x0052); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT, 0x0053); +CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); +CPUMF_EVENT_ATTR(cf_z10, L1D_L3_LOCAL_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z10, L1I_L3_REMOTE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z10, L1D_L3_REMOTE_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z10, L1D_LMEM_SOURCED_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z10, L1I_LMEM_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_z10, L1D_RO_EXCL_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z10, L1I_CACHELINE_INVALIDATES, 0x0089); +CPUMF_EVENT_ATTR(cf_z10, ITLB1_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z10, DTLB1_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z10, TLB2_PTE_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_z10, TLB2_CRSTE_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_z10, TLB2_CRSTE_HPAGE_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_z10, ITLB1_MISSES, 0x0091); +CPUMF_EVENT_ATTR(cf_z10, DTLB1_MISSES, 0x0092); +CPUMF_EVENT_ATTR(cf_z10, L2C_STORES_SENT, 0x0093); +CPUMF_EVENT_ATTR(cf_z196, L1D_L2_SOURCED_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z196, L1I_L2_SOURCED_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z196, ITLB1_MISSES, 0x0083); +CPUMF_EVENT_ATTR(cf_z196, L2C_STORES_SENT, 0x0085); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFBOOK_L3_SOURCED_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z196, L1D_ONBOOK_L4_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_z196, L1I_ONBOOK_L4_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z196, L1D_RO_EXCL_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFBOOK_L4_SOURCED_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFBOOK_L4_SOURCED_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_HPAGE_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_z196, L1D_LMEM_SOURCED_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_z196, L1I_LMEM_SOURCED_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFBOOK_L3_SOURCED_WRITES, 0x008f); +CPUMF_EVENT_ATTR(cf_z196, DTLB1_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z196, ITLB1_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z196, TLB2_PTE_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_z196, TLB2_CRSTE_HPAGE_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z196, TLB2_CRSTE_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z196, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z196, L1D_OFFCHIP_L3_SOURCED_WRITES, 0x0098); +CPUMF_EVENT_ATTR(cf_z196, L1I_ONCHIP_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z196, L1I_OFFCHIP_L3_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_MISSES, 0x0080); +CPUMF_EVENT_ATTR(cf_zec12, ITLB1_MISSES, 0x0081); +CPUMF_EVENT_ATTR(cf_zec12, L1D_L2I_SOURCED_WRITES, 0x0082); +CPUMF_EVENT_ATTR(cf_zec12, L1I_L2I_SOURCED_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_zec12, L1D_L2D_SOURCED_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_zec12, L1D_LMEM_SOURCED_WRITES, 0x0087); +CPUMF_EVENT_ATTR(cf_zec12, L1I_LMEM_SOURCED_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_zec12, L1D_RO_EXCL_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_zec12, DTLB1_HPAGE_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_zec12, ITLB1_WRITES, 0x008c); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_PTE_WRITES, 0x008d); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_CRSTE_HPAGE_WRITES, 0x008e); +CPUMF_EVENT_ATTR(cf_zec12, TLB2_CRSTE_WRITES, 0x008f); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONBOOK_L4_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L4_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TEND, 0x0095); +CPUMF_EVENT_ATTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES_IV, 0x0097); +CPUMF_EVENT_ATTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONBOOK_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TEND, 0x009e); +CPUMF_EVENT_ATTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x009f); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV, 0x00a1); +CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TABORT, 0x00b1); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_NO_SPECIAL, 0x00b2); +CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_SPECIAL, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z13, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z13, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z13, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z13, L1C_TLB1_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0091); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV, 0x0093); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x0095); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x0098); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x0099); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x009c); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES, 0x009e); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES, 0x009f); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES, 0x00a0); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES, 0x00a1); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES, 0x00a4); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV, 0x00a5); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00a7); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV, 0x00a8); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x00aa); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x00ab); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x00ad); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES, 0x00b0); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES, 0x00b1); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES, 0x00b2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TABORT, 0x00da); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_NO_SPECIAL, 0x00db); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_SPECIAL, 0x00dc); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z14, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z14, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z14, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z14, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z14, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z14, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z14, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z14, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z14, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z14, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z14, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z14, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z14, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z14, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + +CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + +static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, PROBLEM_STATE_L1D_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn1, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_fvn3, CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1I_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1I_PENALTY_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES), + CPUMF_EVENT_PTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_fvn3, L1D_DIR_WRITES), + CPUMF_EVENT_PTR(cf_fvn3, L1D_PENALTY_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_6, ECC_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_CYCLES_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT), + NULL, +}; + +static struct attribute *cpumcf_z10_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z10, L1I_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_L3_LOCAL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L3_LOCAL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_L3_REMOTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_L3_REMOTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z10, L1I_CACHELINE_INVALIDATES), + CPUMF_EVENT_PTR(cf_z10, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z10, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z10, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z10, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z10, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z10, L2C_STORES_SENT), + NULL, +}; + +static struct attribute *cpumcf_z196_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z196, L1D_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_L2_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z196, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z196, L2C_STORES_SENT), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z196, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z196, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1D_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z196, L1I_OFFCHIP_L3_SOURCED_WRITES), + NULL, +}; + +static struct attribute *cpumcf_zec12_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_zec12, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_zec12, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_zec12, L1D_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_LMEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_zec12, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_zec12, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1D_OFFBOOK_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TEND), + CPUMF_EVENT_PTR(cf_zec12, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_zec12, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_zec12, TX_C_TABORT_SPECIAL), + NULL, +}; + +static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z13, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z13, L1C_TLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z14, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z14, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z14, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z14, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z14, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z14, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z14, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z14, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z14, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z14, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z15, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z15, DFLT_CC), + CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +/* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ + +static struct attribute_group cpumcf_pmu_events_group = { + .name = "events", +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cpumcf_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cpumcf_pmu_format_group = { + .name = "format", + .attrs = cpumcf_pmu_format_attr, +}; + +static const struct attribute_group *cpumcf_pmu_attr_groups[] = { + &cpumcf_pmu_events_group, + &cpumcf_pmu_format_group, + NULL, +}; + + +static __init struct attribute **merge_attr(struct attribute **a, + struct attribute **b, + struct attribute **c) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + for (i = 0; c[i]; i++) + j++; + j++; + + new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL); + if (!new) + return NULL; + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + for (i = 0; c[i]; i++) + new[j++] = c[i]; + new[j] = NULL; + + return new; +} + +__init const struct attribute_group **cpumf_cf_event_group(void) +{ + struct attribute **combined, **model, **cfvn, **csvn; + struct attribute *none[] = { NULL }; + struct cpumf_ctr_info ci; + struct cpuid cpu_id; + + /* Determine generic counters set(s) */ + qctri(&ci); + switch (ci.cfvn) { + case 1: + cfvn = cpumcf_fvn1_pmu_event_attr; + break; + case 3: + cfvn = cpumcf_fvn3_pmu_event_attr; + break; + default: + cfvn = none; + } + + /* Determine version specific crypto set */ + switch (ci.csvn) { + case 1 ... 5: + csvn = cpumcf_svn_12345_pmu_event_attr; + break; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; + break; + default: + csvn = none; + } + + /* Determine model-specific counter set(s) */ + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2097: + case 0x2098: + model = cpumcf_z10_pmu_event_attr; + break; + case 0x2817: + case 0x2818: + model = cpumcf_z196_pmu_event_attr; + break; + case 0x2827: + case 0x2828: + model = cpumcf_zec12_pmu_event_attr; + break; + case 0x2964: + case 0x2965: + model = cpumcf_z13_pmu_event_attr; + break; + case 0x3906: + case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; + case 0x8561: + case 0x8562: + model = cpumcf_z15_pmu_event_attr; + break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; + break; + default: + model = none; + break; + } + + combined = merge_attr(cfvn, csvn, model); + if (combined) + cpumcf_pmu_events_group.attrs = combined; + return cpumcf_pmu_attr_groups; +} diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c new file mode 100644 index 0000000000..06efad5b4f --- /dev/null +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -0,0 +1,2280 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for the System z CPU-measurement Sampling Facility + * + * Copyright IBM Corp. 2013, 2018 + * Author(s): Hendrik Brueckner + */ +#define KMSG_COMPONENT "cpum_sf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Minimum number of sample-data-block-tables: + * At least one table is required for the sampling buffer structure. + * A single table contains up to 511 pointers to sample-data-blocks. + */ +#define CPUM_SF_MIN_SDBT 1 + +/* Number of sample-data-blocks per sample-data-block-table (SDBT): + * A table contains SDB pointers (8 bytes) and one table-link entry + * that points to the origin of the next SDBT. + */ +#define CPUM_SF_SDB_PER_TABLE ((PAGE_SIZE - 8) / 8) + +/* Maximum page offset for an SDBT table-link entry: + * If this page offset is reached, a table-link entry to the next SDBT + * must be added. + */ +#define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) +static inline int require_table_link(const void *sdbt) +{ + return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; +} + +/* Minimum and maximum sampling buffer sizes: + * + * This number represents the maximum size of the sampling buffer taking + * the number of sample-data-block-tables into account. Note that these + * numbers apply to the basic-sampling function only. + * The maximum number of SDBs is increased by CPUM_SF_SDB_DIAG_FACTOR if + * the diagnostic-sampling function is active. + * + * Sampling buffer size Buffer characteristics + * --------------------------------------------------- + * 64KB == 16 pages (4KB per page) + * 1 page for SDB-tables + * 15 pages for SDBs + * + * 32MB == 8192 pages (4KB per page) + * 16 pages for SDB-tables + * 8176 pages for SDBs + */ +static unsigned long __read_mostly CPUM_SF_MIN_SDB = 15; +static unsigned long __read_mostly CPUM_SF_MAX_SDB = 8176; +static unsigned long __read_mostly CPUM_SF_SDB_DIAG_FACTOR = 1; + +struct sf_buffer { + unsigned long *sdbt; /* Sample-data-block-table origin */ + /* buffer characteristics (required for buffer increments) */ + unsigned long num_sdb; /* Number of sample-data-blocks */ + unsigned long num_sdbt; /* Number of sample-data-block-tables */ + unsigned long *tail; /* last sample-data-block-table */ +}; + +struct aux_buffer { + struct sf_buffer sfb; + unsigned long head; /* index of SDB of buffer head */ + unsigned long alert_mark; /* index of SDB of alert request position */ + unsigned long empty_mark; /* mark of SDB not marked full */ + unsigned long *sdb_index; /* SDB address for fast lookup */ + unsigned long *sdbt_index; /* SDBT address for fast lookup */ +}; + +struct cpu_hw_sf { + /* CPU-measurement sampling information block */ + struct hws_qsi_info_block qsi; + /* CPU-measurement sampling control block */ + struct hws_lsctl_request_block lsctl; + struct sf_buffer sfb; /* Sampling buffer */ + unsigned int flags; /* Status flags */ + struct perf_event *event; /* Scheduled perf event */ + struct perf_output_handle handle; /* AUX buffer output handle */ +}; +static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); + +/* Debug feature */ +static debug_info_t *sfdbg; + +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return TOD timestamp contained in an trailer entry */ +static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) +{ + /* TOD in STCKE format */ + if (te->header.t) + return *((unsigned long long *)&te->timestamp[1]); + + /* TOD in STCK format */ + return *((unsigned long long *)&te->timestamp[0]); +} + +/* Return pointer to trailer entry of an sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + +/* + * sf_disable() - Switch off sampling facility + */ +static int sf_disable(void) +{ + struct hws_lsctl_request_block sreq; + + memset(&sreq, 0, sizeof(sreq)); + return lsctl(&sreq); +} + +/* + * sf_buffer_available() - Check for an allocated sampling buffer + */ +static int sf_buffer_available(struct cpu_hw_sf *cpuhw) +{ + return !!cpuhw->sfb.sdbt; +} + +/* + * deallocate sampling facility buffer + */ +static void free_sampling_buffer(struct sf_buffer *sfb) +{ + unsigned long *sdbt, *curr; + + if (!sfb->sdbt) + return; + + sdbt = sfb->sdbt; + curr = sdbt; + + /* Free the SDBT after all SDBs are processed... */ + while (1) { + if (!*curr || !sdbt) + break; + + /* Process table-link entries */ + if (is_link_entry(curr)) { + curr = get_next_sdbt(curr); + if (sdbt) + free_page((unsigned long)sdbt); + + /* If the origin is reached, sampling buffer is freed */ + if (curr == sfb->sdbt) + break; + else + sdbt = curr; + } else { + /* Process SDB pointer */ + if (*curr) { + free_page((unsigned long)phys_to_virt(*curr)); + curr++; + } + } + } + + debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__, + (unsigned long)sfb->sdbt); + memset(sfb, 0, sizeof(*sfb)); +} + +static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) +{ + struct hws_trailer_entry *te; + unsigned long sdb; + + /* Allocate and initialize sample-data-block */ + sdb = get_zeroed_page(gfp_flags); + if (!sdb) + return -ENOMEM; + te = trailer_entry_ptr(sdb); + te->header.a = 1; + + /* Link SDB into the sample-data-block-table */ + *sdbt = virt_to_phys((void *)sdb); + + return 0; +} + +/* + * realloc_sampling_buffer() - extend sampler memory + * + * Allocates new sample-data-blocks and adds them to the specified sampling + * buffer memory. + * + * Important: This modifies the sampling buffer and must be called when the + * sampling facility is disabled. + * + * Returns zero on success, non-zero otherwise. + */ +static int realloc_sampling_buffer(struct sf_buffer *sfb, + unsigned long num_sdb, gfp_t gfp_flags) +{ + int i, rc; + unsigned long *new, *tail, *tail_prev = NULL; + + if (!sfb->sdbt || !sfb->tail) + return -EINVAL; + + if (!is_link_entry(sfb->tail)) + return -EINVAL; + + /* Append to the existing sampling buffer, overwriting the table-link + * register. + * The tail variables always points to the "tail" (last and table-link) + * entry in an SDB-table. + */ + tail = sfb->tail; + + /* Do a sanity check whether the table-link entry points to + * the sampling buffer origin. + */ + if (sfb->sdbt != get_next_sdbt(tail)) { + debug_sprintf_event(sfdbg, 3, "%s: " + "sampling buffer is not linked: origin %#lx" + " tail %#lx\n", __func__, + (unsigned long)sfb->sdbt, + (unsigned long)tail); + return -EINVAL; + } + + /* Allocate remaining SDBs */ + rc = 0; + for (i = 0; i < num_sdb; i++) { + /* Allocate a new SDB-table if it is full. */ + if (require_table_link(tail)) { + new = (unsigned long *)get_zeroed_page(gfp_flags); + if (!new) { + rc = -ENOMEM; + break; + } + sfb->num_sdbt++; + /* Link current page to tail of chain */ + *tail = virt_to_phys((void *)new) + 1; + tail_prev = tail; + tail = new; + } + + /* Allocate a new sample-data-block. + * If there is not enough memory, stop the realloc process + * and simply use what was allocated. If this is a temporary + * issue, a new realloc call (if required) might succeed. + */ + rc = alloc_sample_data_block(tail, gfp_flags); + if (rc) { + /* Undo last SDBT. An SDBT with no SDB at its first + * entry but with an SDBT entry instead can not be + * handled by the interrupt handler code. + * Avoid this situation. + */ + if (tail_prev) { + sfb->num_sdbt--; + free_page((unsigned long)new); + tail = tail_prev; + } + break; + } + sfb->num_sdb++; + tail++; + tail_prev = new = NULL; /* Allocated at least one SBD */ + } + + /* Link sampling buffer to its origin */ + *tail = virt_to_phys(sfb->sdbt) + 1; + sfb->tail = tail; + + debug_sprintf_event(sfdbg, 4, "%s: new buffer" + " settings: sdbt %lu sdb %lu\n", __func__, + sfb->num_sdbt, sfb->num_sdb); + return rc; +} + +/* + * allocate_sampling_buffer() - allocate sampler memory + * + * Allocates and initializes a sampling buffer structure using the + * specified number of sample-data-blocks (SDB). For each allocation, + * a 4K page is used. The number of sample-data-block-tables (SDBT) + * are calculated from SDBs. + * Also set the ALERT_REQ mask in each SDBs trailer. + * + * Returns zero on success, non-zero otherwise. + */ +static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) +{ + int rc; + + if (sfb->sdbt) + return -EINVAL; + + /* Allocate the sample-data-block-table origin */ + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + return -ENOMEM; + sfb->num_sdb = 0; + sfb->num_sdbt = 1; + + /* Link the table origin to point to itself to prepare for + * realloc_sampling_buffer() invocation. + */ + sfb->tail = sfb->sdbt; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; + + /* Allocate requested number of sample-data-blocks */ + rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); + if (rc) { + free_sampling_buffer(sfb); + debug_sprintf_event(sfdbg, 4, "%s: " + "realloc_sampling_buffer failed with rc %i\n", + __func__, rc); + } else + debug_sprintf_event(sfdbg, 4, + "%s: tear %#lx dear %#lx\n", __func__, + (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt); + return rc; +} + +static void sfb_set_limits(unsigned long min, unsigned long max) +{ + struct hws_qsi_info_block si; + + CPUM_SF_MIN_SDB = min; + CPUM_SF_MAX_SDB = max; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); +} + +static unsigned long sfb_max_limit(struct hw_perf_event *hwc) +{ + return SAMPL_DIAG_MODE(hwc) ? CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR + : CPUM_SF_MAX_SDB; +} + +static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + if (!sfb->sdbt) + return SFB_ALLOC_REG(hwc); + if (SFB_ALLOC_REG(hwc) > sfb->num_sdb) + return SFB_ALLOC_REG(hwc) - sfb->num_sdb; + return 0; +} + +static int sfb_has_pending_allocs(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + return sfb_pending_allocs(sfb, hwc) > 0; +} + +static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) +{ + /* Limit the number of SDBs to not exceed the maximum */ + num = min_t(unsigned long, num, sfb_max_limit(hwc) - SFB_ALLOC_REG(hwc)); + if (num) + SFB_ALLOC_REG(hwc) += num; +} + +static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) +{ + SFB_ALLOC_REG(hwc) = 0; + sfb_account_allocs(num, hwc); +} + +static void deallocate_buffers(struct cpu_hw_sf *cpuhw) +{ + if (cpuhw->sfb.sdbt) + free_sampling_buffer(&cpuhw->sfb); +} + +static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) +{ + unsigned long n_sdb, freq; + size_t sample_size; + + /* Calculate sampling buffers using 4K pages + * + * 1. The sampling size is 32 bytes for basic sampling. This size + * is the same for all machine types. Diagnostic + * sampling uses auxlilary data buffer setup which provides the + * memory for SDBs using linux common code auxiliary trace + * setup. + * + * 2. Function alloc_sampling_buffer() sets the Alert Request + * Control indicator to trigger a measurement-alert to harvest + * sample-data-blocks (SDB). This is done per SDB. This + * measurement alert interrupt fires quick enough to handle + * one SDB, on very high frequency and work loads there might + * be 2 to 3 SBDs available for sample processing. + * Currently there is no need for setup alert request on every + * n-th page. This is counterproductive as one IRQ triggers + * a very high number of samples to be processed at one IRQ. + * + * 3. Use the sampling frequency as input. + * Compute the number of SDBs and ensure a minimum + * of CPUM_SF_MIN_SDB. Depending on frequency add some more + * SDBs to handle a higher sampling rate. + * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples + * (one SDB) for every 10000 HZ frequency increment. + * + * 4. Compute the number of sample-data-block-tables (SDBT) and + * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up + * to 511 SDBs). + */ + sample_size = sizeof(struct hws_basic_entry); + freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); + n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); + + /* If there is already a sampling buffer allocated, it is very likely + * that the sampling facility is enabled too. If the event to be + * initialized requires a greater sampling buffer, the allocation must + * be postponed. Changing the sampling buffer requires the sampling + * facility to be in the disabled state. So, account the number of + * required SDBs and let cpumsf_pmu_enable() resize the buffer just + * before the event is started. + */ + sfb_init_allocs(n_sdb, hwc); + if (sf_buffer_available(cpuhw)) + return 0; + + debug_sprintf_event(sfdbg, 3, + "%s: rate %lu f %lu sdb %lu/%lu" + " sample_size %lu cpuhw %p\n", __func__, + SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), + sample_size, cpuhw); + + return alloc_sampling_buffer(&cpuhw->sfb, + sfb_pending_allocs(&cpuhw->sfb, hwc)); +} + +static unsigned long min_percent(unsigned int percent, unsigned long base, + unsigned long min) +{ + return min_t(unsigned long, min, DIV_ROUND_UP(percent * base, 100)); +} + +static unsigned long compute_sfb_extent(unsigned long ratio, unsigned long base) +{ + /* Use a percentage-based approach to extend the sampling facility + * buffer. Accept up to 5% sample data loss. + * Vary the extents between 1% to 5% of the current number of + * sample-data-blocks. + */ + if (ratio <= 5) + return 0; + if (ratio <= 25) + return min_percent(1, base, 1); + if (ratio <= 50) + return min_percent(1, base, 1); + if (ratio <= 75) + return min_percent(2, base, 2); + if (ratio <= 100) + return min_percent(3, base, 3); + if (ratio <= 250) + return min_percent(4, base, 4); + + return min_percent(5, base, 8); +} + +static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, + struct hw_perf_event *hwc) +{ + unsigned long ratio, num; + + if (!OVERFLOW_REG(hwc)) + return; + + /* The sample_overflow contains the average number of sample data + * that has been lost because sample-data-blocks were full. + * + * Calculate the total number of sample data entries that has been + * discarded. Then calculate the ratio of lost samples to total samples + * per second in percent. + */ + ratio = DIV_ROUND_UP(100 * OVERFLOW_REG(hwc) * cpuhw->sfb.num_sdb, + sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc))); + + /* Compute number of sample-data-blocks */ + num = compute_sfb_extent(ratio, cpuhw->sfb.num_sdb); + if (num) + sfb_account_allocs(num, hwc); + + debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n", + __func__, OVERFLOW_REG(hwc), ratio, num); + OVERFLOW_REG(hwc) = 0; +} + +/* extend_sampling_buffer() - Extend sampling buffer + * @sfb: Sampling buffer structure (for local CPU) + * @hwc: Perf event hardware structure + * + * Use this function to extend the sampling buffer based on the overflow counter + * and postponed allocation extents stored in the specified Perf event hardware. + * + * Important: This function disables the sampling facility in order to safely + * change the sampling buffer structure. Do not call this function + * when the PMU is active. + */ +static void extend_sampling_buffer(struct sf_buffer *sfb, + struct hw_perf_event *hwc) +{ + unsigned long num, num_old; + int rc; + + num = sfb_pending_allocs(sfb, hwc); + if (!num) + return; + num_old = sfb->num_sdb; + + /* Disable the sampling facility to reset any states and also + * clear pending measurement alerts. + */ + sf_disable(); + + /* Extend the sampling buffer. + * This memory allocation typically happens in an atomic context when + * called by perf. Because this is a reallocation, it is fine if the + * new SDB-request cannot be satisfied immediately. + */ + rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); + if (rc) + debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n", + __func__, rc); + + if (sfb_has_pending_allocs(sfb, hwc)) + debug_sprintf_event(sfdbg, 5, "%s: " + "req %lu alloc %lu remaining %lu\n", + __func__, num, sfb->num_sdb - num_old, + sfb_pending_allocs(sfb, hwc)); +} + +/* Number of perf events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_cpumf_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +#define PMC_INIT 0 +#define PMC_RELEASE 1 +#define PMC_FAILURE 2 +static void setup_pmc_cpu(void *flags) +{ + struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + int err = 0; + + switch (*((int *)flags)) { + case PMC_INIT: + memset(cpusf, 0, sizeof(*cpusf)); + err = qsi(&cpusf->qsi); + if (err) + break; + cpusf->flags |= PMU_F_RESERVED; + err = sf_disable(); + break; + case PMC_RELEASE: + cpusf->flags &= ~PMU_F_RESERVED; + err = sf_disable(); + if (!err) + deallocate_buffers(cpusf); + break; + } + if (err) { + *((int *)flags) |= PMC_FAILURE; + pr_err("Switching off the sampling facility failed with rc %i\n", err); + } +} + +static void release_pmc_hardware(void) +{ + int flags = PMC_RELEASE; + + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(setup_pmc_cpu, &flags, 1); +} + +static int reserve_pmc_hardware(void) +{ + int flags = PMC_INIT; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + if (flags & PMC_FAILURE) { + release_pmc_hardware(); + return -ENODEV; + } + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + + return 0; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + /* Release PMC if this is the last perf event */ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static void hw_init_period(struct hw_perf_event *hwc, u64 period) +{ + hwc->sample_period = period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); +} + +static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, + unsigned long rate) +{ + return clamp_t(unsigned long, rate, + si->min_sampl_rate, si->max_sampl_rate); +} + +static u32 cpumsf_pid_type(struct perf_event *event, + u32 pid, enum pid_type type) +{ + struct task_struct *tsk; + + /* Idle process */ + if (!pid) + goto out; + + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + pid = -1; + if (tsk) { + /* + * Only top level events contain the pid namespace in which + * they are created. + */ + if (event->parent) + event = event->parent; + pid = __task_pid_nr_ns(tsk, type, event->ns); + /* + * See also 1d953111b648 + * "perf/core: Don't report zero PIDs for exiting tasks". + */ + if (!pid && !pid_alive(tsk)) + pid = -1; + } +out: + return pid; +} + +static void cpumsf_output_event_pid(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + u32 pid; + struct perf_event_header header; + struct perf_output_handle handle; + + /* + * Obtain the PID from the basic-sampling data entry and + * correct the data->tid_entry.pid value. + */ + pid = data->tid_entry.pid; + + /* Protect callchain buffers, tasks */ + rcu_read_lock(); + + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); + if (perf_output_begin(&handle, data, event, header.size)) + goto out; + + /* Update the process ID (see also kernel/events/core.c) */ + data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID); + data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); + + perf_output_sample(&handle, &header, data, event); + perf_output_end(&handle); +out: + rcu_read_unlock(); +} + +static unsigned long getrate(bool freq, unsigned long sample, + struct hws_qsi_info_block *si) +{ + unsigned long rate; + + if (freq) { + rate = freq_to_sample_rate(si, sample); + rate = hw_limit_rate(si, rate); + } else { + /* The min/max sampling rates specifies the valid range + * of sample periods. If the specified sample period is + * out of range, limit the period to the range boundary. + */ + rate = hw_limit_rate(si, sample); + + /* The perf core maintains a maximum sample rate that is + * configurable through the sysctl interface. Ensure the + * sampling rate does not exceed this value. This also helps + * to avoid throttling when pushing samples with + * perf_event_overflow(). + */ + if (sample_rate_to_freq(si, rate) > + sysctl_perf_event_sample_rate) { + debug_sprintf_event(sfdbg, 1, "%s: " + "Sampling rate exceeds maximum " + "perf sample rate\n", __func__); + rate = 0; + } + } + return rate; +} + +/* The sampling information (si) contains information about the + * min/max sampling intervals and the CPU speed. So calculate the + * correct sampling interval and avoid the whole period adjust + * feedback loop. + * + * Since the CPU Measurement sampling facility can not handle frequency + * calculate the sampling interval when frequency is specified using + * this formula: + * interval := cpu_speed * 1000000 / sample_freq + * + * Returns errno on bad input and zero on success with parameter interval + * set to the correct sampling rate. + * + * Note: This function turns off freq bit to avoid calling function + * perf_adjust_period(). This causes frequency adjustment in the common + * code part which causes tremendous variations in the counter values. + */ +static int __hw_perf_event_init_rate(struct perf_event *event, + struct hws_qsi_info_block *si) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + unsigned long rate; + + if (attr->freq) { + if (!attr->sample_freq) + return -EINVAL; + rate = getrate(attr->freq, attr->sample_freq, si); + attr->freq = 0; /* Don't call perf_adjust_period() */ + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE; + } else { + rate = getrate(attr->freq, attr->sample_period, si); + if (!rate) + return -EINVAL; + } + attr->sample_period = rate; + SAMPL_RATE(hwc) = rate; + hw_init_period(hwc, SAMPL_RATE(hwc)); + debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n", + __func__, event->cpu, event->attr.sample_period, + event->attr.freq, SAMPLE_FREQ_MODE(hwc)); + return 0; +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct cpu_hw_sf *cpuhw; + struct hws_qsi_info_block si; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + int cpu, err; + + /* Reserve CPU-measurement sampling facility */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + if (err) + goto out; + + /* Access per-CPU sampling information (query sampling info) */ + /* + * The event->cpu value can be -1 to count on every CPU, for example, + * when attaching to a task. If this is specified, use the query + * sampling info from the current CPU, otherwise use event->cpu to + * retrieve the per-CPU information. + * Later, cpuhw indicates whether to allocate sampling buffers for a + * particular CPU (cpuhw!=NULL) or each online CPU (cpuw==NULL). + */ + memset(&si, 0, sizeof(si)); + cpuhw = NULL; + if (event->cpu == -1) + qsi(&si); + else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + si = cpuhw->qsi; + } + + /* Check sampling facility authorization and, if not authorized, + * fall back to other PMUs. It is safe to check any CPU because + * the authorization is identical for all configured CPUs. + */ + if (!si.as) { + err = -ENOENT; + goto out; + } + + if (si.ribm & CPU_MF_SF_RIBM_NOTAV) { + pr_warn("CPU Measurement Facility sampling is temporarily not available\n"); + err = -EBUSY; + goto out; + } + + /* Always enable basic sampling */ + SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE; + + /* Check if diagnostic sampling is requested. Deny if the required + * sampling authorization is missing. + */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) { + if (!si.ad) { + err = -EPERM; + goto out; + } + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; + } + + err = __hw_perf_event_init_rate(event, &si); + if (err) + goto out; + + /* Initialize sample data overflow accounting */ + hwc->extra_reg.reg = REG_OVERFLOW; + OVERFLOW_REG(hwc) = 0; + + /* Use AUX buffer. No need to allocate it by ourself */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) + return 0; + + /* Allocate the per-CPU sampling buffer using the CPU information + * from the event. If the event is not pinned to a particular + * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling + * buffers for each online CPU. + */ + if (cpuhw) + /* Event is pinned to a particular CPU */ + err = allocate_buffers(cpuhw, hwc); + else { + /* Event is not pinned, allocate sampling buffer on + * each online CPU + */ + for_each_online_cpu(cpu) { + cpuhw = &per_cpu(cpu_hw_sf, cpu); + err = allocate_buffers(cpuhw, hwc); + if (err) + break; + } + } + + /* If PID/TID sampling is active, replace the default overflow + * handler to extract and resolve the PIDs from the basic-sampling + * data entries. + */ + if (event->attr.sample_type & PERF_SAMPLE_TID) + if (is_default_overflow_handler(event)) + event->overflow_handler = cpumsf_output_event_pid; +out: + return err; +} + +static bool is_callchain_event(struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_STACK_USER); +} + +static int cpumsf_pmu_event_init(struct perf_event *event) +{ + int err; + + /* No support for taken branch sampling */ + /* No support for callchain, stacks and registers */ + if (has_branch_stack(event) || is_callchain_event(event)) + return -EOPNOTSUPP; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + if ((event->attr.config != PERF_EVENT_CPUM_SF) && + (event->attr.config != PERF_EVENT_CPUM_SF_DIAG)) + return -ENOENT; + break; + case PERF_TYPE_HARDWARE: + /* Support sampling of CPU cycles in addition to the + * counter facility. However, the counter facility + * is more precise and, hence, restrict this PMU to + * sampling events only. + */ + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES) + return -ENOENT; + if (!is_sampling_event(event)) + return -ENOENT; + break; + default: + return -ENOENT; + } + + /* Force reset of idle/hv excludes regardless of what the + * user requested. + */ + if (event->attr.exclude_hv) + event->attr.exclude_hv = 0; + if (event->attr.exclude_idle) + event->attr.exclude_idle = 0; + + err = __hw_perf_event_init(event); + if (unlikely(err)) + if (event->destroy) + event->destroy(event); + return err; +} + +static void cpumsf_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct hw_perf_event *hwc; + int err; + + if (cpuhw->flags & PMU_F_ENABLED) + return; + + if (cpuhw->flags & PMU_F_ERR_MASK) + return; + + /* Check whether to extent the sampling buffer. + * + * Two conditions trigger an increase of the sampling buffer for a + * perf event: + * 1. Postponed buffer allocations from the event initialization. + * 2. Sampling overflows that contribute to pending allocations. + * + * Note that the extend_sampling_buffer() function disables the sampling + * facility, but it can be fully re-enabled using sampling controls that + * have been saved in cpumsf_pmu_disable(). + */ + if (cpuhw->event) { + hwc = &cpuhw->event->hw; + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated + * buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + extend_sampling_buffer(&cpuhw->sfb, hwc); + } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); + } + + /* (Re)enable the PMU and sampling facility */ + cpuhw->flags |= PMU_F_ENABLED; + barrier(); + + err = lsctl(&cpuhw->lsctl); + if (err) { + cpuhw->flags &= ~PMU_F_ENABLED; + pr_err("Loading sampling controls failed: op 1 err %i\n", err); + return; + } + + /* Load current program parameter */ + lpp(&S390_lowcore.lpp); + + debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " + "interval %#lx tear %#lx dear %#lx\n", __func__, + cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, + cpuhw->lsctl.cd, cpuhw->lsctl.interval, + cpuhw->lsctl.tear, cpuhw->lsctl.dear); +} + +static void cpumsf_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct hws_lsctl_request_block inactive; + struct hws_qsi_info_block si; + int err; + + if (!(cpuhw->flags & PMU_F_ENABLED)) + return; + + if (cpuhw->flags & PMU_F_ERR_MASK) + return; + + /* Switch off sampling activation control */ + inactive = cpuhw->lsctl; + inactive.cs = 0; + inactive.cd = 0; + + err = lsctl(&inactive); + if (err) { + pr_err("Loading sampling controls failed: op 2 err %i\n", err); + return; + } + + /* Save state of TEAR and DEAR register contents */ + err = qsi(&si); + if (!err) { + /* TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } + } else + debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n", + __func__, err); + + cpuhw->flags &= ~PMU_F_ENABLED; +} + +/* perf_exclude_event() - Filter event + * @event: The perf event + * @regs: pt_regs structure + * @sde_regs: Sample-data-entry (sde) regs structure + * + * Filter perf events according to their exclude specification. + * + * Return non-zero if the event shall be excluded. + */ +static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, + struct perf_sf_sde_regs *sde_regs) +{ + if (event->attr.exclude_user && user_mode(regs)) + return 1; + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + if (event->attr.exclude_guest && sde_regs->in_guest) + return 1; + if (event->attr.exclude_host && !sde_regs->in_guest) + return 1; + return 0; +} + +/* perf_push_sample() - Push samples to perf + * @event: The perf event + * @sample: Hardware sample data + * + * Use the hardware sample data to create perf event sample. The sample + * is the pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int perf_push_sample(struct perf_event *event, + struct hws_basic_entry *basic) +{ + int overflow; + struct pt_regs regs; + struct perf_sf_sde_regs *sde_regs; + struct perf_sample_data data; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* Setup pt_regs to look like an CPU-measurement external interrupt + * using the Program Request Alert code. The regs.int_parm_long + * field which is unused contains additional sample-data-entry related + * indicators. + */ + memset(®s, 0, sizeof(regs)); + regs.int_code = 0x1407; + regs.int_parm = CPU_MF_INT_SF_PRA; + sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; + + psw_bits(regs.psw).ia = basic->ia; + psw_bits(regs.psw).dat = basic->T; + psw_bits(regs.psw).wait = basic->W; + psw_bits(regs.psw).pstate = basic->P; + psw_bits(regs.psw).as = basic->AS; + + /* + * Use the hardware provided configuration level to decide if the + * sample belongs to a guest or host. If that is not available, + * fall back to the following heuristics: + * A non-zero guest program parameter always indicates a guest + * sample. Some early samples or samples from guests without + * lpp usage would be misaccounted to the host. We use the asn + * value as an addon heuristic to detect most of these guest samples. + * If the value differs from 0xffff (the host value), we assume to + * be a KVM guest. + */ + switch (basic->CL) { + case 1: /* logical partition */ + sde_regs->in_guest = 0; + break; + case 2: /* virtual machine */ + sde_regs->in_guest = 1; + break; + default: /* old machine, use heuristics */ + if (basic->gpp || basic->prim_asn != 0xffff) + sde_regs->in_guest = 1; + break; + } + + /* + * Store the PID value from the sample-data-entry to be + * processed and resolved by cpumsf_output_event_pid(). + */ + data.tid_entry.pid = basic->hpp & LPP_PID_MASK; + + overflow = 0; + if (perf_exclude_event(event, ®s, sde_regs)) + goto out; + if (perf_event_overflow(event, &data, ®s)) { + overflow = 1; + event->pmu->stop(event, 0); + } + perf_event_update_userpage(event); +out: + return overflow; +} + +static void perf_event_count_update(struct perf_event *event, u64 count) +{ + local64_add(count, &event->count); +} + +/* hw_collect_samples() - Walk through a sample-data-block and collect samples + * @event: The perf event + * @sdbt: Sample-data-block table + * @overflow: Event overflow counter + * + * Walks through a sample-data-block and collects sampling data entries that are + * then pushed to the perf event subsystem. Depending on the sampling function, + * there can be either basic-sampling or combined-sampling data entries. A + * combined-sampling data entry consists of a basic- and a diagnostic-sampling + * data entry. The sampling function is determined by the flags in the perf + * event hardware structure. The function always works with a combined-sampling + * data entry but ignores the the diagnostic portion if it is not available. + * + * Note that the implementation focuses on basic-sampling data entries and, if + * such an entry is not valid, the entire combined-sampling data entry is + * ignored. + * + * The overflow variables counts the number of samples that has been discarded + * due to a perf event overflow. + */ +static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, + unsigned long long *overflow) +{ + struct hws_trailer_entry *te; + struct hws_basic_entry *sample; + + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; + while ((unsigned long *)sample < (unsigned long *)te) { + /* Check for an empty sample */ + if (!sample->def || sample->LS) + break; + + /* Update perf event period */ + perf_event_count_update(event, SAMPL_RATE(&event->hw)); + + /* Check whether sample is valid */ + if (sample->def == 0x0001) { + /* If an event overflow occurred, the PMU is stopped to + * throttle event delivery. Remaining sample data is + * discarded. + */ + if (!*overflow) { + /* Check whether sample is consistent */ + if (sample->I == 0 && sample->W == 0) { + /* Deliver sample data to perf */ + *overflow = perf_push_sample(event, + sample); + } + } else + /* Count discarded samples */ + *overflow += 1; + } else { + debug_sprintf_event(sfdbg, 4, + "%s: Found unknown" + " sampling data entry: te->f %i" + " basic.def %#4x (%p)\n", __func__, + te->header.f, sample->def, sample); + /* Sample slot is not yet written or other record. + * + * This condition can occur if the buffer was reused + * from a combined basic- and diagnostic-sampling. + * If only basic-sampling is then active, entries are + * written into the larger diagnostic entries. + * This is typically the case for sample-data-blocks + * that are not full. Stop processing if the first + * invalid format was detected. + */ + if (!te->header.f) + break; + } + + /* Reset sample slot and advance to next sample */ + sample->def = 0; + sample++; + } +} + +/* hw_perf_event_update() - Process sampling buffer + * @event: The perf event + * @flush_all: Flag to also flush partially filled sample-data-blocks + * + * Processes the sampling buffer and create perf event samples. + * The sampling buffer position are retrieved and saved in the TEAR_REG + * register of the specified perf event. + * + * Only full sample-data-blocks are processed. Specify the flush_all flag + * to also walk through partially filled sample-data-blocks. + */ +static void hw_perf_event_update(struct perf_event *event, int flush_all) +{ + unsigned long long event_overflow, sampl_overflow, num_sdb; + union hws_trailer_header old, prev, new; + struct hw_perf_event *hwc = &event->hw; + struct hws_trailer_entry *te; + unsigned long *sdbt, sdb; + int done; + + /* + * AUX buffer is used when in diagnostic sampling mode. + * No perf events/samples are created. + */ + if (SAMPL_DIAG_MODE(&event->hw)) + return; + + sdbt = (unsigned long *)TEAR_REG(hwc); + done = event_overflow = sampl_overflow = num_sdb = 0; + while (!done) { + /* Get the trailer entry of the sample-data-block */ + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); + + /* Leave loop if no more work to do (block full indicator) */ + if (!te->header.f) { + done = 1; + if (!flush_all) + break; + } + + /* Check the sample overflow count */ + if (te->header.overflow) + /* Account sample overflows and, if a particular limit + * is reached, extend the sampling buffer. + * For details, see sfb_account_overflows(). + */ + sampl_overflow += te->header.overflow; + + /* Timestamps are valid for full sample-data-blocks only */ + debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " + "overflow %llu timestamp %#llx\n", + __func__, sdb, (unsigned long)sdbt, + te->header.overflow, + (te->header.f) ? trailer_timestamp(te) : 0ULL); + + /* Collect all samples from a single sample-data-block and + * flag if an (perf) event overflow happened. If so, the PMU + * is stopped and remaining samples will be discarded. + */ + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); + num_sdb++; + + /* Reset trailer (using compare-double-and-swap) */ + prev.val = READ_ONCE_ALIGNED_128(te->header.val); + do { + old.val = prev.val; + new.val = prev.val; + new.f = 0; + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); + + /* Advance to next sample-data-block */ + sdbt++; + if (is_link_entry(sdbt)) + sdbt = get_next_sdbt(sdbt); + + /* Update event hardware registers */ + TEAR_REG(hwc) = (unsigned long) sdbt; + + /* Stop processing sample-data if all samples of the current + * sample-data-block were flushed even if it was not full. + */ + if (flush_all && done) + break; + } + + /* Account sample overflows in the event hardware structure */ + if (sampl_overflow) + OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) + + sampl_overflow, 1 + num_sdb); + + /* Perf_event_overflow() and perf_event_account_interrupt() limit + * the interrupt rate to an upper limit. Roughly 1000 samples per + * task tick. + * Hitting this limit results in a large number + * of throttled REF_REPORT_THROTTLE entries and the samples + * are dropped. + * Slightly increase the interval to avoid hitting this limit. + */ + if (event_overflow) { + SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", + __func__, + DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); + } + + if (sampl_overflow || event_overflow) + debug_sprintf_event(sfdbg, 4, "%s: " + "overflows: sample %llu event %llu" + " total %llu num_sdb %llu\n", + __func__, sampl_overflow, event_overflow, + OVERFLOW_REG(hwc), num_sdb); +} + +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} + +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); +} + +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} + +/* + * Get trailer entry by index of SDB. + */ +static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, + unsigned long index) +{ + unsigned long sdb; + + index = aux_sdb_index(aux, index); + sdb = aux->sdb_index[index]; + return trailer_entry_ptr(sdb); +} + +/* + * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu + * disabled. Collect the full SDBs in AUX buffer which have not reached + * the point of alert indicator. And ignore the SDBs which are not + * full. + * + * 1. Scan SDBs to see how much data is there and consume them. + * 2. Remove alert indicator in the buffer. + */ +static void aux_output_end(struct perf_output_handle *handle) +{ + unsigned long i, range_scan, idx; + struct aux_buffer *aux; + struct hws_trailer_entry *te; + + aux = perf_get_aux(handle); + if (!aux) + return; + + range_scan = aux_sdb_num_alert(aux); + for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + if (!te->header.f) + break; + } + /* i is num of SDBs which are full */ + perf_aux_output_end(handle, i << PAGE_SHIFT); + + /* Remove alert indicators in the buffer */ + te = aux_sdb_trailer(aux, aux->alert_mark); + te->header.a = 0; + + debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", + __func__, i, range_scan, aux->head); +} + +/* + * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event + * is first added to the CPU or rescheduled again to the CPU. It is called + * with pmu disabled. + * + * 1. Reset the trailer of SDBs to get ready for new data. + * 2. Tell the hardware where to put the data by reset the SDBs buffer + * head(tear/dear). + */ +static int aux_output_begin(struct perf_output_handle *handle, + struct aux_buffer *aux, + struct cpu_hw_sf *cpuhw) +{ + unsigned long range, i, range_scan, idx, head, base, offset; + struct hws_trailer_entry *te; + + if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + return -EINVAL; + + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range <= 1) + return -ENOMEM; + + /* + * SDBs between aux->head and aux->empty_mark are already ready + * for new data. range_scan is num of SDBs not within them. + */ + debug_sprintf_event(sfdbg, 6, + "%s: range %ld head %ld alert %ld empty %ld\n", + __func__, range, aux->head, aux->alert_mark, + aux->empty_mark); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + te->header.f = 0; + te->header.a = 0; + te->header.overflow = 0; + } + /* Save the position of empty SDBs */ + aux->empty_mark = aux->head + range - 1; + } + + /* Set alert indicator */ + aux->alert_mark = aux->head + range/2 - 1; + te = aux_sdb_trailer(aux, aux->alert_mark); + te->header.a = 1; + + /* Reset hardware buffer head */ + head = aux_sdb_index(aux, aux->head); + base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; + offset = head % CPUM_SF_SDB_PER_TABLE; + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); + + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " + "index %ld tear %#lx dear %#lx\n", __func__, + aux->head, aux->alert_mark, aux->empty_mark, + head / CPUM_SF_SDB_PER_TABLE, + cpuhw->lsctl.tear, cpuhw->lsctl.dear); + + return 0; +} + +/* + * Set alert indicator on SDB at index @alert_index while sampler is running. + * + * Return true if successfully. + * Return false if full indicator is already set by hardware sampler. + */ +static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, + unsigned long long *overflow) +{ + union hws_trailer_header old, prev, new; + struct hws_trailer_entry *te; + + te = aux_sdb_trailer(aux, alert_index); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); + do { + old.val = prev.val; + new.val = prev.val; + *overflow = old.overflow; + if (old.f) { + /* + * SDB is already set by hardware. + * Abort and try to set somewhere + * behind. + */ + return false; + } + new.a = 1; + new.overflow = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); + return true; +} + +/* + * aux_reset_buffer() - Scan and setup SDBs for new samples + * @aux: The AUX buffer to set + * @range: The range of SDBs to scan started from aux->head + * @overflow: Set to overflow count + * + * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is + * marked as empty, check if it is already set full by the hardware sampler. + * If yes, that means new data is already there before we can set an alert + * indicator. Caller should try to set alert indicator to some position behind. + * + * Scan the SDBs in AUX buffer from behind aux->empty_mark. They are used + * previously and have already been consumed by user space. Reset these SDBs + * (clear full indicator and alert indicator) for new data. + * If aux->alert_mark fall in this area, just set it. Overflow count is + * recorded while scanning. + * + * SDBs between aux->head and aux->empty_mark are already reset at last time. + * and ready for new samples. So scanning on this area could be skipped. + * + * Return true if alert indicator is set successfully and false if not. + */ +static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, + unsigned long long *overflow) +{ + unsigned long i, range_scan, idx, idx_old; + union hws_trailer_header old, prev, new; + unsigned long long orig_overflow; + struct hws_trailer_entry *te; + + debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " + "empty %ld\n", __func__, range, aux->head, + aux->alert_mark, aux->empty_mark); + if (range <= aux_sdb_num_empty(aux)) + /* + * No need to scan. All SDBs in range are marked as empty. + * Just set alert indicator. Should check race with hardware + * sampler. + */ + return aux_set_alert(aux, aux->alert_mark, overflow); + + if (aux->alert_mark <= aux->empty_mark) + /* + * Set alert indicator on empty SDB. Should check race + * with hardware sampler. + */ + if (!aux_set_alert(aux, aux->alert_mark, overflow)) + return false; + + /* + * Scan the SDBs to clear full and alert indicator used previously. + * Start scanning from one SDB behind empty_mark. If the new alert + * indicator fall into this range, set it. + */ + range_scan = range - aux_sdb_num_empty(aux); + idx_old = idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); + do { + old.val = prev.val; + new.val = prev.val; + orig_overflow = old.overflow; + new.f = 0; + new.overflow = 0; + if (idx == aux->alert_mark) + new.a = 1; + else + new.a = 0; + prev.val = cmpxchg128(&te->header.val, old.val, new.val); + } while (prev.val != old.val); + *overflow += orig_overflow; + } + + /* Update empty_mark to new position */ + aux->empty_mark = aux->head + range - 1; + + debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " + "empty %ld\n", __func__, range_scan, idx_old, + idx - 1, aux->empty_mark); + return true; +} + +/* + * Measurement alert handler for diagnostic mode sampling. + */ +static void hw_collect_aux(struct cpu_hw_sf *cpuhw) +{ + struct aux_buffer *aux; + int done = 0; + unsigned long range = 0, size; + unsigned long long overflow = 0; + struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; + + aux = perf_get_aux(handle); + if (WARN_ON_ONCE(!aux)) + return; + + /* Inform user space new data arrived */ + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + size >> PAGE_SHIFT); + perf_aux_output_end(handle, size); + + num_sdb = aux->sfb.num_sdb; + while (!done) { + /* Get an output handle */ + aux = perf_aux_output_begin(handle, cpuhw->event); + if (handle->size == 0) { + pr_err("The AUX buffer with %lu pages for the " + "diagnostic-sampling mode is full\n", + num_sdb); + break; + } + if (WARN_ON_ONCE(!aux)) + return; + + /* Update head and alert_mark to new position */ + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range == 1) + aux->alert_mark = aux->head; + else + aux->alert_mark = aux->head + range/2 - 1; + + if (aux_reset_buffer(aux, range, &overflow)) { + if (!overflow) { + done = 1; + break; + } + size = range << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + pr_err("Sample data caused the AUX buffer with %lu " + "pages to overflow\n", aux->sfb.num_sdb); + debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " + "overflow %lld\n", __func__, + aux->head, range, overflow); + } else { + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " + "already full, try another\n", + __func__, + aux->head, aux->alert_mark); + } + } + + if (done) + debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " + "empty %ld\n", __func__, aux->head, + aux->alert_mark, aux->empty_mark); +} + +/* + * Callback when freeing AUX buffers. + */ +static void aux_buffer_free(void *data) +{ + struct aux_buffer *aux = data; + unsigned long i, num_sdbt; + + if (!aux) + return; + + /* Free SDBT. SDB is freed by the caller */ + num_sdbt = aux->sfb.num_sdbt; + for (i = 0; i < num_sdbt; i++) + free_page(aux->sdbt_index[i]); + + kfree(aux->sdbt_index); + kfree(aux->sdb_index); + kfree(aux); + + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); +} + +static void aux_sdb_init(unsigned long sdb) +{ + struct hws_trailer_entry *te; + + te = trailer_entry_ptr(sdb); + + /* Save clock base */ + te->clock_base = 1; + te->progusage2 = tod_clock_base.tod; +} + +/* + * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling + * @event: Event the buffer is setup for, event->cpu == -1 means current + * @pages: Array of pointers to buffer pages passed from perf core + * @nr_pages: Total pages + * @snapshot: Flag for snapshot mode + * + * This is the callback when setup an event using AUX buffer. Perf tool can + * trigger this by an additional mmap() call on the event. Unlike the buffer + * for basic samples, AUX buffer belongs to the event. It is scheduled with + * the task among online cpus when it is a per-thread event. + * + * Return the private AUX buffer structure if success or NULL if fails. + */ +static void *aux_buffer_setup(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) +{ + struct sf_buffer *sfb; + struct aux_buffer *aux; + unsigned long *new, *tail; + int i, n_sdbt; + + if (!nr_pages || !pages) + return NULL; + + if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is larger than the " + "maximum sampling buffer limit\n", + nr_pages); + return NULL; + } else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is less than the " + "minimum sampling buffer limit\n", + nr_pages); + return NULL; + } + + /* Allocate aux_buffer struct for the event */ + aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL); + if (!aux) + goto no_aux; + sfb = &aux->sfb; + + /* Allocate sdbt_index for fast reference */ + n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE); + aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); + if (!aux->sdbt_index) + goto no_sdbt_index; + + /* Allocate sdb_index for fast reference */ + aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!aux->sdb_index) + goto no_sdb_index; + + /* Allocate the first SDBT */ + sfb->num_sdbt = 0; + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; + tail = sfb->tail = sfb->sdbt; + + /* + * Link the provided pages of AUX buffer to SDBT. + * Allocate SDBT if needed. + */ + for (i = 0; i < nr_pages; i++, tail++) { + if (require_table_link(tail)) { + new = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!new) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; + /* Link current page to tail of chain */ + *tail = virt_to_phys(new) + 1; + tail = new; + } + /* Tail is the entry in a SDBT */ + *tail = virt_to_phys(pages[i]); + aux->sdb_index[i] = (unsigned long)pages[i]; + aux_sdb_init((unsigned long)pages[i]); + } + sfb->num_sdb = nr_pages; + + /* Link the last entry in the SDBT to the first SDBT */ + *tail = virt_to_phys(sfb->sdbt) + 1; + sfb->tail = tail; + + /* + * Initial all SDBs are zeroed. Mark it as empty. + * So there is no need to clear the full indicator + * when this event is first added. + */ + aux->empty_mark = sfb->num_sdb - 1; + + debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, + sfb->num_sdbt, sfb->num_sdb); + + return aux; + +no_sdbt: + /* SDBs (AUX buffer pages) are freed by caller */ + for (i = 0; i < sfb->num_sdbt; i++) + free_page(aux->sdbt_index[i]); + kfree(aux->sdb_index); +no_sdb_index: + kfree(aux->sdbt_index); +no_sdbt_index: + kfree(aux); +no_aux: + return NULL; +} + +static void cpumsf_pmu_read(struct perf_event *event) +{ + /* Nothing to do ... updates are interrupt-driven */ +} + +/* Check if the new sampling period/frequency is appropriate. + * + * Return non-zero on error and zero on passed checks. + */ +static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) +{ + struct hws_qsi_info_block si; + unsigned long rate; + bool do_freq; + + memset(&si, 0, sizeof(si)); + if (event->cpu == -1) { + if (qsi(&si)) + return -ENODEV; + } else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + + si = cpuhw->qsi; + } + + do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + rate = getrate(do_freq, value, &si); + if (!rate) + return -EINVAL; + + event->attr.sample_period = rate; + SAMPL_RATE(&event->hw) = rate; + hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); + debug_sprintf_event(sfdbg, 4, "%s:" + " cpu %d value %#llx period %#llx freq %d\n", + __func__, event->cpu, value, + event->attr.sample_period, do_freq); + return 0; +} + +/* Activate sampling control. + * Next call of pmu_enable() starts sampling. + */ +static void cpumsf_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + perf_pmu_disable(event->pmu); + event->hw.state = 0; + cpuhw->lsctl.cs = 1; + if (SAMPL_DIAG_MODE(&event->hw)) + cpuhw->lsctl.cd = 1; + perf_pmu_enable(event->pmu); +} + +/* Deactivate sampling control. + * Next call of pmu_enable() stops sampling. + */ +static void cpumsf_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + perf_pmu_disable(event->pmu); + cpuhw->lsctl.cs = 0; + cpuhw->lsctl.cd = 0; + event->hw.state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { + hw_perf_event_update(event, 1); + event->hw.state |= PERF_HES_UPTODATE; + } + perf_pmu_enable(event->pmu); +} + +static int cpumsf_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct aux_buffer *aux; + int err; + + if (cpuhw->flags & PMU_F_IN_USE) + return -EAGAIN; + + if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) + return -EINVAL; + + err = 0; + perf_pmu_disable(event->pmu); + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* Set up sampling controls. Always program the sampling register + * using the SDB-table start. Reset TEAR_REG event hardware register + * that is used by hw_perf_event_update() to store the sampling buffer + * position after samples have been flushed. + */ + cpuhw->lsctl.s = 0; + cpuhw->lsctl.h = 1; + cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); + if (!SAMPL_DIAG_MODE(&event->hw)) { + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); + cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt; + TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt; + } + + /* Ensure sampling functions are in the disabled state. If disabled, + * switch on sampling enable control. */ + if (WARN_ON_ONCE(cpuhw->lsctl.es == 1 || cpuhw->lsctl.ed == 1)) { + err = -EAGAIN; + goto out; + } + if (SAMPL_DIAG_MODE(&event->hw)) { + aux = perf_aux_output_begin(&cpuhw->handle, event); + if (!aux) { + err = -EINVAL; + goto out; + } + err = aux_output_begin(&cpuhw->handle, aux, cpuhw); + if (err) + goto out; + cpuhw->lsctl.ed = 1; + } + cpuhw->lsctl.es = 1; + + /* Set in_use flag and store event */ + cpuhw->event = event; + cpuhw->flags |= PMU_F_IN_USE; + + if (flags & PERF_EF_START) + cpumsf_pmu_start(event, PERF_EF_RELOAD); +out: + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + return err; +} + +static void cpumsf_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + + perf_pmu_disable(event->pmu); + cpumsf_pmu_stop(event, PERF_EF_UPDATE); + + cpuhw->lsctl.es = 0; + cpuhw->lsctl.ed = 0; + cpuhw->flags &= ~PMU_F_IN_USE; + cpuhw->event = NULL; + + if (SAMPL_DIAG_MODE(&event->hw)) + aux_output_end(&cpuhw->handle); + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); +} + +CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF); +CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); + +/* Attribute list for CPU_SF. + * + * The availablitiy depends on the CPU_MF sampling facility authorization + * for basic + diagnositic samples. This is determined at initialization + * time by the sampling facility device driver. + * If the authorization for basic samples is turned off, it should be + * also turned off for diagnostic sampling. + * + * During initialization of the device driver, check the authorization + * level for diagnostic sampling and installs the attribute + * file for diagnostic sampling if necessary. + * + * For now install a placeholder to reference all possible attributes: + * SF_CYCLES_BASIC and SF_CYCLES_BASIC_DIAG. + * Add another entry for the final NULL pointer. + */ +enum { + SF_CYCLES_BASIC_ATTR_IDX = 0, + SF_CYCLES_BASIC_DIAG_ATTR_IDX, + SF_CYCLES_ATTR_MAX +}; + +static struct attribute *cpumsf_pmu_events_attr[SF_CYCLES_ATTR_MAX + 1] = { + [SF_CYCLES_BASIC_ATTR_IDX] = CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC) +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cpumsf_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cpumsf_pmu_events_group = { + .name = "events", + .attrs = cpumsf_pmu_events_attr, +}; + +static struct attribute_group cpumsf_pmu_format_group = { + .name = "format", + .attrs = cpumsf_pmu_format_attr, +}; + +static const struct attribute_group *cpumsf_pmu_attr_groups[] = { + &cpumsf_pmu_events_group, + &cpumsf_pmu_format_group, + NULL, +}; + +static struct pmu cpumf_sampling = { + .pmu_enable = cpumsf_pmu_enable, + .pmu_disable = cpumsf_pmu_disable, + + .event_init = cpumsf_pmu_event_init, + .add = cpumsf_pmu_add, + .del = cpumsf_pmu_del, + + .start = cpumsf_pmu_start, + .stop = cpumsf_pmu_stop, + .read = cpumsf_pmu_read, + + .attr_groups = cpumsf_pmu_attr_groups, + + .setup_aux = aux_buffer_setup, + .free_aux = aux_buffer_free, + + .check_period = cpumsf_pmu_check_period, +}; + +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_hw_sf *cpuhw; + + if (!(alert & CPU_MF_INT_SF_MASK)) + return; + inc_irq_stat(IRQEXT_CMS); + cpuhw = this_cpu_ptr(&cpu_hw_sf); + + /* Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. */ + if (!(cpuhw->flags & PMU_F_RESERVED)) + return; + + /* The processing below must take care of multiple alert events that + * might be indicated concurrently. */ + + /* Program alert request */ + if (alert & CPU_MF_INT_SF_PRA) { + if (cpuhw->flags & PMU_F_IN_USE) + if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) + hw_collect_aux(cpuhw); + else + hw_perf_event_update(cpuhw->event, 0); + else + WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); + } + + /* Report measurement alerts only for non-PRA codes */ + if (alert != CPU_MF_INT_SF_PRA) + debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__, + alert); + + /* Sampling authorization change request */ + if (alert & CPU_MF_INT_SF_SACA) + qsi(&cpuhw->qsi); + + /* Loss of sample data due to high-priority machine activities */ + if (alert & CPU_MF_INT_SF_LSDA) { + pr_err("Sample data was lost\n"); + cpuhw->flags |= PMU_F_ERR_LSDA; + sf_disable(); + } + + /* Invalid sampling buffer entry */ + if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) { + pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n", + alert); + cpuhw->flags |= PMU_F_ERR_IBE; + sf_disable(); + } +} + +static int cpusf_pmu_setup(unsigned int cpu, int flags) +{ + /* Ignore the notification if no events are scheduled on the PMU. + * This might be racy... + */ + if (!atomic_read(&num_events)) + return 0; + + local_irq_disable(); + setup_pmc_cpu(&flags); + local_irq_enable(); + return 0; +} + +static int s390_pmu_sf_online_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_INIT); +} + +static int s390_pmu_sf_offline_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_RELEASE); +} + +static int param_get_sfb_size(char *buffer, const struct kernel_param *kp) +{ + if (!cpum_sf_avail()) + return -ENODEV; + return sprintf(buffer, "%lu,%lu", CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB); +} + +static int param_set_sfb_size(const char *val, const struct kernel_param *kp) +{ + int rc; + unsigned long min, max; + + if (!cpum_sf_avail()) + return -ENODEV; + if (!val || !strlen(val)) + return -EINVAL; + + /* Valid parameter values: "min,max" or "max" */ + min = CPUM_SF_MIN_SDB; + max = CPUM_SF_MAX_SDB; + if (strchr(val, ',')) + rc = (sscanf(val, "%lu,%lu", &min, &max) == 2) ? 0 : -EINVAL; + else + rc = kstrtoul(val, 10, &max); + + if (min < 2 || min >= max || max > get_num_physpages()) + rc = -EINVAL; + if (rc) + return rc; + + sfb_set_limits(min, max); + pr_info("The sampling buffer limits have changed to: " + "min %lu max %lu (diag %lu)\n", + CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR); + return 0; +} + +#define param_check_sfb_size(name, p) __param_check(name, p, void) +static const struct kernel_param_ops param_ops_sfb_size = { + .set = param_set_sfb_size, + .get = param_get_sfb_size, +}; + +#define RS_INIT_FAILURE_QSI 0x0001 +#define RS_INIT_FAILURE_BSDES 0x0002 +#define RS_INIT_FAILURE_ALRT 0x0003 +#define RS_INIT_FAILURE_PERF 0x0004 +static void __init pr_cpumsf_err(unsigned int reason) +{ + pr_err("Sampling facility support for perf is not available: " + "reason %#x\n", reason); +} + +static int __init init_cpum_sampling_pmu(void) +{ + struct hws_qsi_info_block si; + int err; + + if (!cpum_sf_avail()) + return -ENODEV; + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) { + pr_cpumsf_err(RS_INIT_FAILURE_QSI); + return -ENODEV; + } + + if (!si.as && !si.ad) + return -ENODEV; + + if (si.bsdes != sizeof(struct hws_basic_entry)) { + pr_cpumsf_err(RS_INIT_FAILURE_BSDES); + return -EINVAL; + } + + if (si.ad) { + sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB); + /* Sampling of diagnostic data authorized, + * install event into attribute list of PMU device. + */ + cpumsf_pmu_events_attr[SF_CYCLES_BASIC_DIAG_ATTR_IDX] = + CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); + } + + sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); + if (!sfdbg) { + pr_err("Registering for s390dbf failed\n"); + return -ENOMEM; + } + debug_register_view(sfdbg, &debug_sprintf_view); + + err = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (err) { + pr_cpumsf_err(RS_INIT_FAILURE_ALRT); + debug_unregister(sfdbg); + goto out; + } + + err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW); + if (err) { + pr_cpumsf_err(RS_INIT_FAILURE_PERF); + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + debug_unregister(sfdbg); + goto out; + } + + cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "perf/s390/sf:online", + s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu); +out: + return err; +} + +arch_initcall(init_cpum_sampling_pmu); +core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c new file mode 100644 index 0000000000..c27321cb09 --- /dev/null +++ b/arch/s390/kernel/perf_event.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support for s390x + * + * Copyright IBM Corp. 2012, 2013 + * Author(s): Hendrik Brueckner + */ +#define KMSG_COMPONENT "perf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) +{ + struct stack_frame *stack = (struct stack_frame *) regs->gprs[15]; + + if (!stack) + return NULL; + + return (struct kvm_s390_sie_block *)stack->sie_control_block; +} + +static bool is_in_guest(struct pt_regs *regs) +{ + if (user_mode(regs)) + return false; +#if IS_ENABLED(CONFIG_KVM) + return instruction_pointer(regs) == (unsigned long) &sie_exit; +#else + return false; +#endif +} + +static unsigned long guest_is_user_mode(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE; +} + +static unsigned long instruction_pointer_guest(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.addr; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + return is_in_guest(regs) ? instruction_pointer_guest(regs) + : instruction_pointer(regs); +} + +static unsigned long perf_misc_guest_flags(struct pt_regs *regs) +{ + return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER + : PERF_RECORD_MISC_GUEST_KERNEL; +} + +static unsigned long perf_misc_flags_sf(struct pt_regs *regs) +{ + struct perf_sf_sde_regs *sde_regs; + unsigned long flags; + + sde_regs = (struct perf_sf_sde_regs *) ®s->int_parm_long; + if (sde_regs->in_guest) + flags = user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER + : PERF_RECORD_MISC_GUEST_KERNEL; + else + flags = user_mode(regs) ? PERF_RECORD_MISC_USER + : PERF_RECORD_MISC_KERNEL; + return flags; +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + /* Check if the cpum_sf PMU has created the pt_regs structure. + * In this case, perf misc flags can be easily extracted. Otherwise, + * do regular checks on the pt_regs content. + */ + if (regs->int_code == 0x1407 && regs->int_parm == CPU_MF_INT_SF_PRA) + if (!regs->gprs[15]) + return perf_misc_flags_sf(regs); + + if (is_in_guest(regs)) + return perf_misc_guest_flags(regs); + + return user_mode(regs) ? PERF_RECORD_MISC_USER + : PERF_RECORD_MISC_KERNEL; +} + +static void print_debug_cf(void) +{ + struct cpumf_ctr_info cf_info; + int cpu = smp_processor_id(); + + memset(&cf_info, 0, sizeof(cf_info)); + if (!qctri(&cf_info)) + pr_info("CPU[%i] CPUM_CF: ver=%u.%u A=%04x E=%04x C=%04x\n", + cpu, cf_info.cfvn, cf_info.csvn, + cf_info.auth_ctl, cf_info.enable_ctl, cf_info.act_ctl); +} + +static void print_debug_sf(void) +{ + struct hws_qsi_info_block si; + int cpu = smp_processor_id(); + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) + return; + + pr_info("CPU[%i] CPUM_SF: basic=%i diag=%i min=%lu max=%lu cpu_speed=%u\n", + cpu, si.as, si.ad, si.min_sampl_rate, si.max_sampl_rate, + si.cpu_speed); + + if (si.as) + pr_info("CPU[%i] CPUM_SF: Basic-sampling: a=%i e=%i c=%i" + " bsdes=%i tear=%016lx dear=%016lx\n", cpu, + si.as, si.es, si.cs, si.bsdes, si.tear, si.dear); + if (si.ad) + pr_info("CPU[%i] CPUM_SF: Diagnostic-sampling: a=%i e=%i c=%i" + " dsdes=%i tear=%016lx dear=%016lx\n", cpu, + si.ad, si.ed, si.cd, si.dsdes, si.tear, si.dear); +} + +void perf_event_print_debug(void) +{ + unsigned long flags; + + local_irq_save(flags); + if (cpum_cf_avail()) + print_debug_cf(); + if (cpum_sf_avail()) + print_debug_sf(); + local_irq_restore(flags); +} + +/* Service level infrastructure */ +static void sl_print_counter(struct seq_file *m) +{ + struct cpumf_ctr_info ci; + + memset(&ci, 0, sizeof(ci)); + if (qctri(&ci)) + return; + + seq_printf(m, "CPU-MF: Counter facility: version=%u.%u " + "authorization=%04x\n", ci.cfvn, ci.csvn, ci.auth_ctl); +} + +static void sl_print_sampling(struct seq_file *m) +{ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (qsi(&si)) + return; + + if (!si.as && !si.ad) + return; + + seq_printf(m, "CPU-MF: Sampling facility: min_rate=%lu max_rate=%lu" + " cpu_speed=%u\n", si.min_sampl_rate, si.max_sampl_rate, + si.cpu_speed); + if (si.as) + seq_printf(m, "CPU-MF: Sampling facility: mode=basic" + " sample_size=%u\n", si.bsdes); + if (si.ad) + seq_printf(m, "CPU-MF: Sampling facility: mode=diagnostic" + " sample_size=%u\n", si.dsdes); +} + +static void service_level_perf_print(struct seq_file *m, + struct service_level *sl) +{ + if (cpum_cf_avail()) + sl_print_counter(m); + if (cpum_sf_avail()) + sl_print_sampling(m); +} + +static struct service_level service_level_perf = { + .seq_print = service_level_perf_print, +}; + +static int __init service_level_perf_register(void) +{ + return register_service_level(&service_level_perf); +} +arch_initcall(service_level_perf_register); + +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct unwind_state state; + unsigned long addr; + + unwind_for_each_frame(&state, current, regs, 0) { + addr = unwind_get_return_address(&state); + if (!addr || perf_callchain_store(entry, addr)) + return; + } +} + +/* Perf definitions for PMU event attributes in sysfs */ +ssize_t cpumf_events_sysfs_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); +} diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c new file mode 100644 index 0000000000..fe7d1774de --- /dev/null +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -0,0 +1,698 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter + */ +#define KMSG_COMPONENT "pai_crypto" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static debug_info_t *cfm_dbg; +static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ + /* extracted with QPACI instruction */ + +DEFINE_STATIC_KEY_FALSE(pai_key); + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +struct paicrypt_map { + unsigned long *page; /* Page for CPU to store counters */ + struct pai_userdata *save; /* Page to store no-zero counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ + enum paievt_mode mode; /* Type of event */ + struct perf_event *event; /* Perf event for sampling */ +}; + +static DEFINE_PER_CPU(struct paicrypt_map, paicrypt_map); + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Adjust usage counters and remove allocated memory when all users are + * gone. + */ +static void paicrypt_event_destroy(struct perf_event *event) +{ + struct paicrypt_map *cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + + cpump->event = NULL; + static_branch_dec(&pai_key); + mutex_lock(&pai_reserve_mutex); + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" + " mode %d refcnt %u\n", __func__, + event->attr.config, event->cpu, + cpump->active_events, cpump->mode, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) { + debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", + __func__, (unsigned long)cpump->page, + cpump->save); + free_page((unsigned long)cpump->page); + cpump->page = NULL; + kvfree(cpump->save); + cpump->save = NULL; + cpump->mode = PAI_MODE_NONE; + } + mutex_unlock(&pai_reserve_mutex); +} + +static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) +{ + if (kernel) + nr += PAI_CRYPTO_MAXCTR; + return cpump->page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For event + * CRYPTO_ALL sum up all events. + */ +static u64 paicrypt_getdata(struct perf_event *event, bool kernel) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + u64 sum = 0; + int i; + + if (event->attr.config != PAI_CRYPTO_BASE) { + return paicrypt_getctr(cpump, + event->attr.config - PAI_CRYPTO_BASE, + kernel); + } + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = paicrypt_getctr(cpump, i, kernel); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += paicrypt_getdata(event, true); + if (!event->attr.exclude_user) + sum += paicrypt_getdata(event, false); + + return sum; +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for crypto events + * + * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is save to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump) +{ + int rc = 0; + + mutex_lock(&pai_reserve_mutex); + if (a->sample_period) { /* Sampling requested */ + if (cpump->mode != PAI_MODE_NONE) + rc = -EBUSY; /* ... sampling/counting active */ + } else { /* Counting requested */ + if (cpump->mode == PAI_MODE_SAMPLING) + rc = -EBUSY; /* ... and sampling active */ + } + if (rc) + goto unlock; + + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + if (cpump->page) { + refcount_inc(&cpump->refcnt); + goto unlock; + } + + rc = -ENOMEM; + cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!cpump->page) + goto unlock; + cpump->save = kvmalloc_array(paicrypt_cnt + 1, + sizeof(struct pai_userdata), GFP_KERNEL); + if (!cpump->save) { + free_page((unsigned long)cpump->page); + cpump->page = NULL; + goto unlock; + } + rc = 0; + refcount_set(&cpump->refcnt, 1); + +unlock: + /* If rc is non-zero, do not set mode and reference count */ + if (!rc) { + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING + : PAI_MODE_COUNTING; + } + debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d" + " mode %d refcnt %u page %#lx save %p rc %d\n", + __func__, a->sample_period, cpump->active_events, + cpump->mode, refcount_read(&cpump->refcnt), + (unsigned long)cpump->page, cpump->save, rc); + mutex_unlock(&pai_reserve_mutex); + return rc; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paicrypt_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump; + int rc; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI crypto event must be in valid range */ + if (a->config < PAI_CRYPTO_BASE || + a->config > PAI_CRYPTO_BASE + paicrypt_cnt) + return -EINVAL; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only CRYPTO_ALL for sampling. */ + if (a->sample_period && a->config != PAI_CRYPTO_BASE) + return -EINVAL; + + cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + rc = paicrypt_busy(a, cpump); + if (rc) + return rc; + + /* Event initialization sets last_tag to 0. When later on the events + * are deleted and re-added, do not reset the event count value to zero. + * Events are added, deleted and re-added when 2 or more events + * are active at the same time. + */ + event->hw.last_tag = 0; + cpump->event = event; + event->destroy = paicrypt_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + static_branch_inc(&pai_key); + return 0; +} + +static void paicrypt_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paicrypt_getall(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + u64 sum; + + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->count, 0); + local64_set(&event->hw.prev_count, sum); + } +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + unsigned long ccd; + + if (++cpump->active_events == 1) { + ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(S390_lowcore.ccd, ccd); + __ctl_set_bit(0, 50); + } + cpump->event = event; + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paicrypt_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) + perf_sched_cb_inc(event->pmu); + return 0; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + paicrypt_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) + /* Only counting needs to read counter */ + paicrypt_stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + __ctl_clear_bit(0, 50); + WRITE_ONCE(S390_lowcore.ccd, 0); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paicrypt_copy(struct pai_userdata *userdata, + struct paicrypt_map *cpump, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = 0; + + if (!exclude_kernel) + val += paicrypt_getctr(cpump, i, true); + if (!exclude_user) + val += paicrypt_getctr(cpump, i, false); + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(struct pai_userdata); +} + +static int paicrypt_push_sample(void) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + if (!cpump->event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore page after read */ + memset(cpump->page, 0, PAGE_SIZE); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paicrypt_push_sample(); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_invalid_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + int i; + + for (i = 0; i < num; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_CRYPTO_BASE + num; + pa->attr.attr.name = paicrypt_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paicrypt_events_group.attrs = attrs; + return 0; +} + +static int __init paicrypt_init(void) +{ + struct qpaci_info_block ib; + int rc; + + if (!test_facility(196)) + return 0; + + qpaci(&ib); + paicrypt_cnt = ib.num_cc; + if (paicrypt_cnt == 0) + return 0; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) + paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + + rc = attr_event_init(); /* Export known PAI crypto events */ + if (rc) { + pr_err("Creation of PMU pai_crypto /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!cfm_dbg) { + pr_err("Registration of s390dbf pai_crypto failed\n"); + return -ENOMEM; + } + debug_register_view(cfm_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); + if (rc) { + pr_err("Registering the pai_crypto PMU failed with rc=%i\n", + rc); + debug_unregister_view(cfm_dbg, &debug_sprintf_view); + debug_unregister(cfm_dbg); + return rc; + } + return 0; +} + +device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 0000000000..c57c1a2032 --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[488]; +} __packed; + +struct paiext_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Area to store non-zero counters */ + enum paievt_mode mode; /* Type of event */ + unsigned int active_events; /* # of PAI Extension users */ + refcount_t refcnt; + struct perf_event *event; /* Perf event for sampling */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ +}; + +struct paiext_mapptr { + struct paiext_map *mapptr; +}; + +static struct paiext_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct paiext_mapptr __percpu *mapptr; +} paiext_root; + +/* Free per CPU data when the last event is removed. */ +static void paiext_root_free(void) +{ + if (refcount_dec_and_test(&paiext_root.refcnt)) { + free_percpu(paiext_root.mapptr); + paiext_root.mapptr = NULL; + } +} + +/* On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paiext_root_alloc(void) +{ + if (!refcount_inc_not_zero(&paiext_root.refcnt)) { + /* The memory is already zeroed. */ + paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); + if (!paiext_root.mapptr) { + /* Returning without refcnt adjustment is ok. The + * error code is handled by paiext_alloc() which + * decrements refcnt when an event can not be + * created. + */ + return -ENOMEM; + } + refcount_set(&paiext_root.refcnt, 1); + } + return 0; +} + +/* Protects against concurrent increment of sampler and counter member + * increments at the same time and prohibits concurrent execution of + * counting and sampling events. + * Ensures that analytics counter block is deallocated only when the + * sampling and counting on that cpu is zero. + * For details see paiext_alloc(). + */ +static DEFINE_MUTEX(paiext_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void paiext_free(struct paiext_mapptr *mp) +{ + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Release the PMU if event is the last perf event */ +static void paiext_event_destroy(struct perf_event *event) +{ + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + struct paiext_map *cpump = mp->mapptr; + + mutex_lock(&paiext_reserve_mutex); + cpump->event = NULL; + if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ + paiext_free(mp); + paiext_root_free(); + mutex_unlock(&paiext_reserve_mutex); + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__, + event->cpu, mp->mapptr); + +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for pai_extension events. + * + * Only one instance of event pai_ext/NNPA_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) +{ + struct paiext_mapptr *mp; + struct paiext_map *cpump; + int rc; + + mutex_lock(&paiext_reserve_mutex); + + rc = paiext_root_alloc(); + if (rc) + goto unlock; + + mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paiext_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) + goto undo; + + /* Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary alignment. + * - a 1KB byte block and requires 1KB boundary alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * need adjustment. + */ + mp->mapptr = cpump; + cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + cpump->save = kvmalloc_array(paiext_cnt + 1, + sizeof(struct pai_userdata), + GFP_KERNEL); + if (!cpump->save || !cpump->area || !cpump->paiext_cb) { + paiext_free(mp); + goto undo; + } + refcount_set(&cpump->refcnt, 1); + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING + : PAI_MODE_COUNTING; + } else { + /* Multiple invocation, check what is active. + * Supported are multiple counter events or only one sampling + * event concurrently at any one time. + */ + if (cpump->mode == PAI_MODE_SAMPLING || + (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) { + rc = -EBUSY; + goto undo; + } + refcount_inc(&cpump->refcnt); + } + + rc = 0; + cpump->event = event; + +undo: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event in not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + paiext_root_free(); + } +unlock: + mutex_unlock(&paiext_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +/* The PAI extension 1 control block supports up to 128 entries. Return + * the index within PAIE1_CB given the event number. Also validate event + * number. + */ +static int paiext_event_valid(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { + /* Offset NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + return 0; + } + return -EINVAL; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI extension event must be valid and in supported range */ + rc = paiext_event_valid(event); + if (rc) + return rc; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only event NNPA_ALL for sampling. */ + if (a->sample_period && a->config != PAI_NNPA_BASE) + return -EINVAL; + /* Prohibit exclude_user event selection */ + if (a->exclude_user) + return -EINVAL; + + rc = paiext_alloc(a, event); + if (rc) + return rc; + event->hw.last_tag = 0; + event->destroy = paiext_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which are the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + return 0; +} + +static u64 paiext_getctr(struct paiext_map *cpump, int nr) +{ + return cpump->area[nr]; +} + +/* Read the counter values. Return value from location in buffer. For event + * NNPA_ALL sum up all events. + */ +static u64 paiext_getdata(struct perf_event *event) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_NNPA_BASE) + return paiext_getctr(cpump, event->attr.config - PAI_NNPA_BASE); + + for (i = 1; i <= paiext_cnt; i++) + sum += paiext_getctr(cpump, i); + + return sum; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return paiext_getdata(event); +} + +static void paiext_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paiext_getall(event); + local64_set(&event->hw.prev_count, new); + delta = new - prev; + local64_add(delta, &event->count); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + u64 sum; + + if (event->hw.last_tag) + return; + event->hw.last_tag = 1; + sum = paiext_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + local64_set(&event->count, 0); +} + +static int paiext_add(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (++cpump->active_events == 1) { + S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + __ctl_set_bit(0, 49); + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paiext_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) { + cpump->event = event; + perf_sched_cb_inc(event->pmu); + } + return 0; +} + +static void paiext_stop(struct perf_event *event, int flags) +{ + paiext_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paiext_del(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) { + /* Only counting needs to read counter */ + paiext_stop(event, PERF_EF_UPDATE); + } + if (--cpump->active_events == 0) { + /* Disable CPU instruction lookup for PAIE1 control block */ + __ctl_clear_bit(0, 49); + pcb->acc = 0; + S390_lowcore.aicd = 0; + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paiext_copy(struct paiext_map *cpump) +{ + struct pai_userdata *userdata = cpump->save; + int i, outidx = 0; + + for (i = 1; i <= paiext_cnt; i++) { + u64 val = paiext_getctr(cpump, i); + + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write sample when one or more counters values are nonzero. + * + * Note: The function paiext_sched_task() and paiext_push_sample() are not + * invoked after function paiext_del() has been called because of function + * perf_sched_cb_dec(). + * The function paiext_sched_task() and paiext_push_sample() are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paiext_sched_task() as call back + * to run at context switch time (see paiext_add()). + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed an + * sched_task() callback. That callback is not active after paiext_del() + * returns and has deleted the event on that CPU. + */ +static int paiext_push_sample(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + rawsize = paiext_copy(cpump); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = smp_processor_id(); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore area after read */ + memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paiext_push_sample(); +} + +/* Attribute definitions for pai extension1 interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1800 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography + * counters. + * Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_invalid_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + struct device_attribute *dap; + int i; + + for (i = 0; i < num; i++) { + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_NNPA_BASE + num; + pa->attr.attr.name = paiext_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c new file mode 100644 index 0000000000..6e9e5d5e92 --- /dev/null +++ b/arch/s390/kernel/perf_regs.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + freg_t fp; + + if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15) + return regs->gprs[idx]; + + if (idx >= PERF_REG_S390_FP0 && idx <= PERF_REG_S390_FP15) { + if (!user_mode(regs)) + return 0; + + idx -= PERF_REG_S390_FP0; + fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) + : current->thread.fpu.fprs[idx]; + return fp.ui; + } + + if (idx == PERF_REG_S390_MASK) + return regs->psw.mask; + if (idx == PERF_REG_S390_PC) + return regs->psw.addr; + + WARN_ON_ONCE((u32)idx >= PERF_REG_S390_MAX); + return 0; +} + +#define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_31BIT)) + return PERF_SAMPLE_REGS_ABI_32; + + return PERF_SAMPLE_REGS_ABI_64; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + /* + * Use the regs from the first interruption and let + * perf_sample_regs_intr() handle interrupts (regs == get_irq_regs()). + * + * Also save FPU registers for user-space tasks only. + */ + regs_user->regs = task_pt_regs(current); + if (user_mode(regs_user->regs)) + save_fpu_regs(); + regs_user->abi = perf_reg_abi(current); +} diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c new file mode 100644 index 0000000000..2580004177 --- /dev/null +++ b/arch/s390/kernel/process.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file handles the architecture dependent parts of process handling. + * + * Copyright IBM Corp. 1999, 2009 + * Author(s): Martin Schwidefsky , + * Hartmut Penner , + * Denis Joseph Barrow, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +void ret_from_fork(void) asm("ret_from_fork"); + +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs) +{ + void (*func)(void *arg); + + schedule_tail(prev); + + if (!user_mode(regs)) { + /* Kernel thread */ + func = (void *)regs->gprs[9]; + func((void *)regs->gprs[10]); + } + clear_pt_regs_flag(regs, PIF_SYSCALL); + syscall_exit_to_user_mode(regs); +} + +void flush_thread(void) +{ +} + +void arch_setup_new_exec(void) +{ + if (S390_lowcore.current_pid != current->pid) { + S390_lowcore.current_pid = current->pid; + if (test_facility(40)) + lpp(&S390_lowcore.lpp); + } +} + +void arch_release_task_struct(struct task_struct *tsk) +{ + runtime_instr_release(tsk); + guarded_storage_release(tsk); +} + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + /* + * Save the floating-point or vector register state of the current + * task and set the CIF_FPU flag to lazy restore the FPU register + * state when returning to user space. + */ + save_fpu_regs(); + + memcpy(dst, src, arch_task_struct_size); + dst->thread.fpu.regs = dst->thread.fpu.fprs; + + /* + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. + */ + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; + + return 0; +} + +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) +{ + unsigned long clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long tls = args->tls; + struct fake_frame + { + struct stack_frame sf; + struct pt_regs childregs; + } *frame; + + frame = container_of(task_pt_regs(p), struct fake_frame, childregs); + p->thread.ksp = (unsigned long) frame; + /* Save access registers to new thread structure. */ + save_access_regs(&p->thread.acrs[0]); + /* start new process with ar4 pointing to the correct address space */ + /* Don't copy debug registers */ + memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); + memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); + clear_tsk_thread_flag(p, TIF_SINGLE_STEP); + p->thread.per_flags = 0; + /* Initialize per thread user and system timer values */ + p->thread.user_timer = 0; + p->thread.guest_timer = 0; + p->thread.system_timer = 0; + p->thread.hardirq_timer = 0; + p->thread.softirq_timer = 0; + p->thread.last_break = 1; + + frame->sf.back_chain = 0; + frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs; + frame->sf.gprs[12 - 6] = (unsigned long)p; + /* new return point is ret_from_fork */ + frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork; + /* fake return stack for resume(), don't go back to schedule */ + frame->sf.gprs[15 - 6] = (unsigned long)frame; + + /* Store access registers to kernel stack of new process. */ + if (unlikely(args->fn)) { + /* kernel thread */ + memset(&frame->childregs, 0, sizeof(struct pt_regs)); + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; + frame->childregs.orig_gpr2 = -1; + frame->childregs.last_break = 1; + return 0; + } + frame->childregs = *current_pt_regs(); + frame->childregs.gprs[2] = 0; /* child returns 0 on fork. */ + frame->childregs.flags = 0; + if (new_stackp) + frame->childregs.gprs[15] = new_stackp; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ + frame->childregs.psw.mask &= ~PSW_MASK_RI; + + /* Set a new TLS ? */ + if (clone_flags & CLONE_SETTLS) { + if (is_compat_task()) { + p->thread.acrs[0] = (unsigned int)tls; + } else { + p->thread.acrs[0] = (unsigned int)(tls >> 32); + p->thread.acrs[1] = (unsigned int)tls; + } + } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; + return 0; +} + +void execve_tail(void) +{ + current->thread.fpu.fpc = 0; + asm volatile("sfpc %0" : : "d" (0)); +} + +unsigned long __get_wchan(struct task_struct *p) +{ + struct unwind_state state; + unsigned long ip = 0; + + if (!task_stack_page(p)) + return 0; + + if (!try_get_task_stack(p)) + return 0; + + unwind_for_each_frame(&state, p, NULL, 0) { + if (state.stack_info.type != STACK_TYPE_TASK) { + ip = 0; + break; + } + + ip = unwind_get_return_address(&state); + if (!ip) + break; + + if (!in_sched_functions(ip)) + break; + } + + put_task_stack(p); + return ip; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_u32_below(PAGE_SIZE); + return sp & ~0xf; +} + +static inline unsigned long brk_rnd(void) +{ + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long ret; + + ret = PAGE_ALIGN(mm->brk + brk_rnd()); + return (ret > mm->brk) ? ret : mm->brk; +} diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c new file mode 100644 index 0000000000..0a999c8226 --- /dev/null +++ b/arch/s390/kernel/processor.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +unsigned long __read_mostly elf_hwcap; +char elf_platform[ELF_PLATFORM_SIZE]; + +struct cpu_info { + unsigned int cpu_mhz_dynamic; + unsigned int cpu_mhz_static; + struct cpuid cpu_id; +}; + +static DEFINE_PER_CPU(struct cpu_info, cpu_info); +static DEFINE_PER_CPU(int, cpu_relax_retry); + +static bool machine_has_cpu_mhz; + +void __init cpu_detect_mhz_feature(void) +{ + if (test_facility(34) && __ecag(ECAG_CPU_ATTRIBUTE, 0) != -1UL) + machine_has_cpu_mhz = true; +} + +static void update_cpu_mhz(void *arg) +{ + unsigned long mhz; + struct cpu_info *c; + + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + c = this_cpu_ptr(&cpu_info); + c->cpu_mhz_dynamic = mhz >> 32; + c->cpu_mhz_static = mhz & 0xffffffff; +} + +void s390_update_cpu_mhz(void) +{ + s390_adjust_jiffies(); + if (machine_has_cpu_mhz) + on_each_cpu(update_cpu_mhz, NULL, 0); +} + +void notrace stop_machine_yield(const struct cpumask *cpumask) +{ + int cpu, this_cpu; + + this_cpu = smp_processor_id(); + if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { + __this_cpu_write(cpu_relax_retry, 0); + cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + if (cpu >= nr_cpu_ids) + return; + if (arch_vcpu_is_preempted(cpu)) + smp_yield_cpu(cpu); + } +} + +/* + * cpu_init - initializes state that is per-CPU. + */ +void cpu_init(void) +{ + struct cpuid *id = this_cpu_ptr(&cpu_info.cpu_id); + + get_cpu_id(id); + if (machine_has_cpu_mhz) + update_cpu_mhz(NULL); + mmgrab(&init_mm); + current->active_mm = &init_mm; + BUG_ON(current->mm); + enter_lazy_tlb(&init_mm, current); +} + +static void show_facilities(struct seq_file *m) +{ + unsigned int bit; + + seq_puts(m, "facilities :"); + for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT) + seq_printf(m, " %d", bit); + seq_putc(m, '\n'); +} + +static void show_cpu_summary(struct seq_file *m, void *v) +{ + static const char *hwcap_str[] = { + [HWCAP_NR_ESAN3] = "esan3", + [HWCAP_NR_ZARCH] = "zarch", + [HWCAP_NR_STFLE] = "stfle", + [HWCAP_NR_MSA] = "msa", + [HWCAP_NR_LDISP] = "ldisp", + [HWCAP_NR_EIMM] = "eimm", + [HWCAP_NR_DFP] = "dfp", + [HWCAP_NR_HPAGE] = "edat", + [HWCAP_NR_ETF3EH] = "etf3eh", + [HWCAP_NR_HIGH_GPRS] = "highgprs", + [HWCAP_NR_TE] = "te", + [HWCAP_NR_VXRS] = "vx", + [HWCAP_NR_VXRS_BCD] = "vxd", + [HWCAP_NR_VXRS_EXT] = "vxe", + [HWCAP_NR_GS] = "gs", + [HWCAP_NR_VXRS_EXT2] = "vxe2", + [HWCAP_NR_VXRS_PDE] = "vxp", + [HWCAP_NR_SORT] = "sort", + [HWCAP_NR_DFLT] = "dflt", + [HWCAP_NR_VXRS_PDE2] = "vxp2", + [HWCAP_NR_NNPA] = "nnpa", + [HWCAP_NR_PCI_MIO] = "pcimio", + [HWCAP_NR_SIE] = "sie", + }; + int i, cpu; + + BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX); + seq_printf(m, "vendor_id : IBM/S390\n" + "# processors : %i\n" + "bogomips per cpu: %lu.%02lu\n", + num_online_cpus(), loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ))%100); + seq_printf(m, "max thread id : %d\n", smp_cpu_mtid); + seq_puts(m, "features\t: "); + for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) + if (hwcap_str[i] && (elf_hwcap & (1UL << i))) + seq_printf(m, "%s ", hwcap_str[i]); + seq_puts(m, "\n"); + show_facilities(m); + show_cacheinfo(m); + for_each_online_cpu(cpu) { + struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu); + + seq_printf(m, "processor %d: " + "version = %02X, " + "identification = %06X, " + "machine = %04X\n", + cpu, id->version, id->ident, id->machine); + } +} + +static int __init setup_hwcaps(void) +{ + /* instructions named N3, "backported" to esa-mode */ + elf_hwcap |= HWCAP_ESAN3; + + /* z/Architecture mode active */ + elf_hwcap |= HWCAP_ZARCH; + + /* store-facility-list-extended */ + if (test_facility(7)) + elf_hwcap |= HWCAP_STFLE; + + /* message-security assist */ + if (test_facility(17)) + elf_hwcap |= HWCAP_MSA; + + /* long-displacement */ + if (test_facility(19)) + elf_hwcap |= HWCAP_LDISP; + + /* extended-immediate */ + elf_hwcap |= HWCAP_EIMM; + + /* extended-translation facility 3 enhancement */ + if (test_facility(22) && test_facility(30)) + elf_hwcap |= HWCAP_ETF3EH; + + /* decimal floating point & perform floating point operation */ + if (test_facility(42) && test_facility(44)) + elf_hwcap |= HWCAP_DFP; + + /* huge page support */ + if (MACHINE_HAS_EDAT1) + elf_hwcap |= HWCAP_HPAGE; + + /* 64-bit register support for 31-bit processes */ + elf_hwcap |= HWCAP_HIGH_GPRS; + + /* transactional execution */ + if (MACHINE_HAS_TE) + elf_hwcap |= HWCAP_TE; + + /* + * Vector extension can be disabled with the "novx" parameter. + * Use MACHINE_HAS_VX instead of facility bit 129. + */ + if (MACHINE_HAS_VX) { + elf_hwcap |= HWCAP_VXRS; + if (test_facility(134)) + elf_hwcap |= HWCAP_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_VXRS_EXT; + if (test_facility(148)) + elf_hwcap |= HWCAP_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_VXRS_PDE; + if (test_facility(192)) + elf_hwcap |= HWCAP_VXRS_PDE2; + } + + if (test_facility(150)) + elf_hwcap |= HWCAP_SORT; + + if (test_facility(151)) + elf_hwcap |= HWCAP_DFLT; + + if (test_facility(165)) + elf_hwcap |= HWCAP_NNPA; + + /* guarded storage */ + if (MACHINE_HAS_GS) + elf_hwcap |= HWCAP_GS; + + if (MACHINE_HAS_PCI_MIO) + elf_hwcap |= HWCAP_PCI_MIO; + + /* virtualization support */ + if (sclp.has_sief2) + elf_hwcap |= HWCAP_SIE; + + return 0; +} +arch_initcall(setup_hwcaps); + +static int __init setup_elf_platform(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + add_device_randomness(&cpu_id, sizeof(cpu_id)); + switch (cpu_id.machine) { + default: /* Use "z10" as default. */ + strcpy(elf_platform, "z10"); + break; + case 0x2817: + case 0x2818: + strcpy(elf_platform, "z196"); + break; + case 0x2827: + case 0x2828: + strcpy(elf_platform, "zEC12"); + break; + case 0x2964: + case 0x2965: + strcpy(elf_platform, "z13"); + break; + case 0x3906: + case 0x3907: + strcpy(elf_platform, "z14"); + break; + case 0x8561: + case 0x8562: + strcpy(elf_platform, "z15"); + break; + case 0x3931: + case 0x3932: + strcpy(elf_platform, "z16"); + break; + } + return 0; +} +arch_initcall(setup_elf_platform); + +static void show_cpu_topology(struct seq_file *m, unsigned long n) +{ +#ifdef CONFIG_SCHED_TOPOLOGY + seq_printf(m, "physical id : %d\n", topology_physical_package_id(n)); + seq_printf(m, "core id : %d\n", topology_core_id(n)); + seq_printf(m, "book id : %d\n", topology_book_id(n)); + seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); + seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); + seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n))); + seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); +#endif /* CONFIG_SCHED_TOPOLOGY */ +} + +static void show_cpu_ids(struct seq_file *m, unsigned long n) +{ + struct cpuid *id = &per_cpu(cpu_info.cpu_id, n); + + seq_printf(m, "version : %02X\n", id->version); + seq_printf(m, "identification : %06X\n", id->ident); + seq_printf(m, "machine : %04X\n", id->machine); +} + +static void show_cpu_mhz(struct seq_file *m, unsigned long n) +{ + struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + + if (!machine_has_cpu_mhz) + return; + seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); + seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); +} + +/* + * show_cpuinfo - Get information on one CPU for use by procfs. + */ +static int show_cpuinfo(struct seq_file *m, void *v) +{ + unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); + + if (n == first) + show_cpu_summary(m, v); + seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_topology(m, n); + show_cpu_ids(m, n); + show_cpu_mhz(m, n); + return 0; +} + +static inline void *c_update(loff_t *pos) +{ + if (*pos) + *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); + return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + cpus_read_lock(); + return c_update(pos); +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_update(pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ + cpus_read_unlock(); +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c new file mode 100644 index 0000000000..ea244a73ef --- /dev/null +++ b/arch/s390/kernel/ptrace.c @@ -0,0 +1,1608 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ptrace user space interface. + * + * Copyright IBM Corp. 1999, 2010 + * Author(s): Denis Joseph Barrow + * Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include "asm/ptrace.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "entry.h" + +#ifdef CONFIG_COMPAT +#include "compat_ptrace.h" +#endif + +void update_cr_regs(struct task_struct *task) +{ + struct pt_regs *regs = task_pt_regs(task); + struct thread_struct *thread = &task->thread; + struct per_regs old, new; + union ctlreg0 cr0_old, cr0_new; + union ctlreg2 cr2_old, cr2_new; + int cr0_changed, cr2_changed; + + __ctl_store(cr0_old.val, 0, 0); + __ctl_store(cr2_old.val, 2, 2); + cr0_new = cr0_old; + cr2_new = cr2_old; + /* Take care of the enable/disable of transactional execution. */ + if (MACHINE_HAS_TE) { + /* Set or clear transaction execution TXC bit 8. */ + cr0_new.tcx = 1; + if (task->thread.per_flags & PER_FLAG_NO_TE) + cr0_new.tcx = 0; + /* Set or clear transaction execution TDC bits 62 and 63. */ + cr2_new.tdc = 0; + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) + cr2_new.tdc = 1; + else + cr2_new.tdc = 2; + } + } + /* Take care of enable/disable of guarded storage. */ + if (MACHINE_HAS_GS) { + cr2_new.gse = 0; + if (task->thread.gs_cb) + cr2_new.gse = 1; + } + /* Load control register 0/2 iff changed */ + cr0_changed = cr0_new.val != cr0_old.val; + cr2_changed = cr2_new.val != cr2_old.val; + if (cr0_changed) + __ctl_load(cr0_new.val, 0, 0); + if (cr2_changed) + __ctl_load(cr2_new.val, 2, 2); + /* Copy user specified PER registers */ + new.control = thread->per_user.control; + new.start = thread->per_user.start; + new.end = thread->per_user.end; + + /* merge TIF_SINGLE_STEP into user specified PER registers. */ + if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || + test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { + if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) + new.control |= PER_EVENT_BRANCH; + else + new.control |= PER_EVENT_IFETCH; + new.control |= PER_CONTROL_SUSPENSION; + new.control |= PER_EVENT_TRANSACTION_END; + if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) + new.control |= PER_EVENT_IFETCH; + new.start = 0; + new.end = -1UL; + } + + /* Take care of the PER enablement bit in the PSW. */ + if (!(new.control & PER_EVENT_MASK)) { + regs->psw.mask &= ~PSW_MASK_PER; + return; + } + regs->psw.mask |= PSW_MASK_PER; + __ctl_store(old, 9, 11); + if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) + __ctl_load(new, 9, 11); +} + +void user_enable_single_step(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_BLOCK_STEP); + set_tsk_thread_flag(task, TIF_SINGLE_STEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_BLOCK_STEP); + clear_tsk_thread_flag(task, TIF_SINGLE_STEP); +} + +void user_enable_block_step(struct task_struct *task) +{ + set_tsk_thread_flag(task, TIF_SINGLE_STEP); + set_tsk_thread_flag(task, TIF_BLOCK_STEP); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Clear all debugging related fields. + */ +void ptrace_disable(struct task_struct *task) +{ + memset(&task->thread.per_user, 0, sizeof(task->thread.per_user)); + memset(&task->thread.per_event, 0, sizeof(task->thread.per_event)); + clear_tsk_thread_flag(task, TIF_SINGLE_STEP); + clear_tsk_thread_flag(task, TIF_PER_TRAP); + task->thread.per_flags = 0; +} + +#define __ADDR_MASK 7 + +static inline unsigned long __peek_user_per(struct task_struct *child, + addr_t addr) +{ + if (addr == offsetof(struct per_struct_kernel, cr9)) + /* Control bits of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + PER_EVENT_IFETCH : child->thread.per_user.control; + else if (addr == offsetof(struct per_struct_kernel, cr10)) + /* Start address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + 0 : child->thread.per_user.start; + else if (addr == offsetof(struct per_struct_kernel, cr11)) + /* End address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + -1UL : child->thread.per_user.end; + else if (addr == offsetof(struct per_struct_kernel, bits)) + /* Single-step bit. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + (1UL << (BITS_PER_LONG - 1)) : 0; + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) + /* Start address of the user specified per set. */ + return child->thread.per_user.start; + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) + /* End address of the user specified per set. */ + return child->thread.per_user.end; + else if (addr == offsetof(struct per_struct_kernel, perc_atmid)) + /* PER code, ATMID and AI of the last PER trap */ + return (unsigned long) + child->thread.per_event.cause << (BITS_PER_LONG - 16); + else if (addr == offsetof(struct per_struct_kernel, address)) + /* Address of the last PER trap */ + return child->thread.per_event.address; + else if (addr == offsetof(struct per_struct_kernel, access_id)) + /* Access id of the last PER trap */ + return (unsigned long) + child->thread.per_event.paid << (BITS_PER_LONG - 8); + return 0; +} + +/* + * Read the word at offset addr from the user area of a process. The + * trouble here is that the information is littered over different + * locations. The process registers are found on the kernel stack, + * the floating point stuff and the trace settings are stored in + * the task structure. In addition the different structures in + * struct user contain pad bytes that should be read as zeroes. + * Lovely... + */ +static unsigned long __peek_user(struct task_struct *child, addr_t addr) +{ + addr_t offset, tmp; + + if (addr < offsetof(struct user, regs.acrs)) { + /* + * psw and gprs are stored on the stack + */ + tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); + if (addr == offsetof(struct user, regs.psw.mask)) { + /* Return a clean psw mask. */ + tmp &= PSW_MASK_USER | PSW_MASK_RI; + tmp |= PSW_USER_BITS; + } + + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - offsetof(struct user, regs.acrs); + /* + * Very special case: old & broken 64 bit gdb reading + * from acrs[15]. Result is a 64 bit value. Read the + * 32 bit acrs[15] value and shift it by 32. Sick... + */ + if (addr == offsetof(struct user, regs.acrs[15])) + tmp = ((unsigned long) child->thread.acrs[15]) << 32; + else + tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); + + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + + } else if (addr < offsetof(struct user, regs.fp_regs)) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { + /* + * floating point control reg. is in the thread structure + */ + tmp = child->thread.fpu.fpc; + tmp <<= BITS_PER_LONG - 32; + + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (MACHINE_HAS_VX) + tmp = *(addr_t *) + ((addr_t) child->thread.fpu.vxrs + 2*offset); + else + tmp = *(addr_t *) + ((addr_t) child->thread.fpu.fprs + offset); + + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { + /* + * Handle access to the per_info structure. + */ + addr -= offsetof(struct user, regs.per_info); + tmp = __peek_user_per(child, addr); + + } else + tmp = 0; + + return tmp; +} + +static int +peek_user(struct task_struct *child, addr_t addr, addr_t data) +{ + addr_t tmp, mask; + + /* + * Stupid gdb peeks/pokes the access registers in 64 bit with + * an alignment of 4. Programmers from hell... + */ + mask = __ADDR_MASK; + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) + mask = 3; + if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) + return -EIO; + + tmp = __peek_user(child, addr); + return put_user(tmp, (addr_t __user *) data); +} + +static inline void __poke_user_per(struct task_struct *child, + addr_t addr, addr_t data) +{ + /* + * There are only three fields in the per_info struct that the + * debugger user can write to. + * 1) cr9: the debugger wants to set a new PER event mask + * 2) starting_addr: the debugger wants to set a new starting + * address to use with the PER event mask. + * 3) ending_addr: the debugger wants to set a new ending + * address to use with the PER event mask. + * The user specified PER event mask and the start and end + * addresses are used only if single stepping is not in effect. + * Writes to any other field in per_info are ignored. + */ + if (addr == offsetof(struct per_struct_kernel, cr9)) + /* PER event mask of the user specified per set. */ + child->thread.per_user.control = + data & (PER_EVENT_MASK | PER_CONTROL_MASK); + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) + /* Starting address of the user specified per set. */ + child->thread.per_user.start = data; + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) + /* Ending address of the user specified per set. */ + child->thread.per_user.end = data; +} + +/* + * Write a word to the user area of a process at location addr. This + * operation does have an additional problem compared to peek_user. + * Stores to the program status word and on the floating point + * control register needs to get checked for validity. + */ +static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) +{ + addr_t offset; + + + if (addr < offsetof(struct user, regs.acrs)) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw and gprs are stored on the stack + */ + if (addr == offsetof(struct user, regs.psw.mask)) { + unsigned long mask = PSW_MASK_USER; + + mask |= is_ri_task(child) ? PSW_MASK_RI : 0; + if ((data ^ PSW_USER_BITS) & ~mask) + /* Invalid psw mask. */ + return -EINVAL; + if ((data & PSW_MASK_ASC) == PSW_ASC_HOME) + /* Invalid address-space-control bits */ + return -EINVAL; + if ((data & PSW_MASK_EA) && !(data & PSW_MASK_BA)) + /* Invalid addressing mode bits */ + return -EINVAL; + } + + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + + regs->int_code = 0x20000 | (data & 0xffff); + } + *(addr_t *)((addr_t) ®s->psw + addr) = data; + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - offsetof(struct user, regs.acrs); + /* + * Very special case: old & broken 64 bit gdb writing + * to acrs[15] with a 64 bit value. Ignore the lower + * half of the value and write the upper 32 bit to + * acrs[15]. Sick... + */ + if (addr == offsetof(struct user, regs.acrs[15])) + child->thread.acrs[15] = (unsigned int) (data >> 32); + else + *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; + + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + task_pt_regs(child)->orig_gpr2 = data; + + } else if (addr < offsetof(struct user, regs.fp_regs)) { + /* + * prevent writes of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { + /* + * floating point control reg. is in the thread structure + */ + if ((unsigned int) data != 0 || + test_fp_ctl(data >> (BITS_PER_LONG - 32))) + return -EINVAL; + child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); + + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + if (MACHINE_HAS_VX) + *(addr_t *)((addr_t) + child->thread.fpu.vxrs + 2*offset) = data; + else + *(addr_t *)((addr_t) + child->thread.fpu.fprs + offset) = data; + + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { + /* + * Handle access to the per_info structure. + */ + addr -= offsetof(struct user, regs.per_info); + __poke_user_per(child, addr, data); + + } + + return 0; +} + +static int poke_user(struct task_struct *child, addr_t addr, addr_t data) +{ + addr_t mask; + + /* + * Stupid gdb peeks/pokes the access registers in 64 bit with + * an alignment of 4. Programmers from hell indeed... + */ + mask = __ADDR_MASK; + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) + mask = 3; + if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) + return -EIO; + + return __poke_user(child, addr, data); +} + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + ptrace_area parea; + int copied, ret; + + switch (request) { + case PTRACE_PEEKUSR: + /* read the word at location addr in the USER area. */ + return peek_user(child, addr, data); + + case PTRACE_POKEUSR: + /* write the word at location addr in the USER area */ + return poke_user(child, addr, data); + + case PTRACE_PEEKUSR_AREA: + case PTRACE_POKEUSR_AREA: + if (copy_from_user(&parea, (void __force __user *) addr, + sizeof(parea))) + return -EFAULT; + addr = parea.kernel_addr; + data = parea.process_addr; + copied = 0; + while (copied < parea.len) { + if (request == PTRACE_PEEKUSR_AREA) + ret = peek_user(child, addr, data); + else { + addr_t utmp; + if (get_user(utmp, + (addr_t __force __user *) data)) + return -EFAULT; + ret = poke_user(child, addr, utmp); + } + if (ret) + return ret; + addr += sizeof(unsigned long); + data += sizeof(unsigned long); + copied += sizeof(unsigned long); + } + return 0; + case PTRACE_GET_LAST_BREAK: + return put_user(child->thread.last_break, (unsigned long __user *)data); + case PTRACE_ENABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags &= ~PER_FLAG_NO_TE; + return 0; + case PTRACE_DISABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags |= PER_FLAG_NO_TE; + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; + return 0; + case PTRACE_TE_ABORT_RAND: + if (!MACHINE_HAS_TE || (child->thread.per_flags & PER_FLAG_NO_TE)) + return -EIO; + switch (data) { + case 0UL: + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; + break; + case 1UL: + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND; + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND_TEND; + break; + case 2UL: + child->thread.per_flags |= PER_FLAG_TE_ABORT_RAND; + child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND_TEND; + break; + default: + return -EINVAL; + } + return 0; + default: + return ptrace_request(child, request, addr, data); + } +} + +#ifdef CONFIG_COMPAT +/* + * Now the fun part starts... a 31 bit program running in the + * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, + * PTRACE_PEEKDATA, PTRACE_POKETEXT and PTRACE_POKEDATA are easy + * to handle, the difference to the 64 bit versions of the requests + * is that the access is done in multiples of 4 byte instead of + * 8 bytes (sizeof(unsigned long) on 31/64 bit). + * The ugly part are PTRACE_PEEKUSR, PTRACE_PEEKUSR_AREA, + * PTRACE_POKEUSR and PTRACE_POKEUSR_AREA. If the traced program + * is a 31 bit program too, the content of struct user can be + * emulated. A 31 bit program peeking into the struct user of + * a 64 bit program is a no-no. + */ + +/* + * Same as peek_user_per but for a 31 bit program. + */ +static inline __u32 __peek_user_per_compat(struct task_struct *child, + addr_t addr) +{ + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) + /* Control bits of the active per set. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + PER_EVENT_IFETCH : child->thread.per_user.control; + else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) + /* Start address of the active per set. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + 0 : child->thread.per_user.start; + else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) + /* End address of the active per set. */ + return test_thread_flag(TIF_SINGLE_STEP) ? + PSW32_ADDR_INSN : child->thread.per_user.end; + else if (addr == offsetof(struct compat_per_struct_kernel, bits)) + /* Single-step bit. */ + return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? + 0x80000000 : 0; + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) + /* Start address of the user specified per set. */ + return (__u32) child->thread.per_user.start; + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) + /* End address of the user specified per set. */ + return (__u32) child->thread.per_user.end; + else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid)) + /* PER code, ATMID and AI of the last PER trap */ + return (__u32) child->thread.per_event.cause << 16; + else if (addr == offsetof(struct compat_per_struct_kernel, address)) + /* Address of the last PER trap */ + return (__u32) child->thread.per_event.address; + else if (addr == offsetof(struct compat_per_struct_kernel, access_id)) + /* Access id of the last PER trap */ + return (__u32) child->thread.per_event.paid << 24; + return 0; +} + +/* + * Same as peek_user but for a 31 bit program. + */ +static u32 __peek_user_compat(struct task_struct *child, addr_t addr) +{ + addr_t offset; + __u32 tmp; + + if (addr < offsetof(struct compat_user, regs.acrs)) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw and gprs are stored on the stack + */ + if (addr == offsetof(struct compat_user, regs.psw.mask)) { + /* Fake a 31 bit psw mask. */ + tmp = (__u32)(regs->psw.mask >> 32); + tmp &= PSW32_MASK_USER | PSW32_MASK_RI; + tmp |= PSW32_USER_BITS; + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { + /* Fake a 31 bit psw address. */ + tmp = (__u32) regs->psw.addr | + (__u32)(regs->psw.mask & PSW_MASK_BA); + } else { + /* gpr 0-15 */ + tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); + } + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - offsetof(struct compat_user, regs.acrs); + tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); + + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { + /* + * floating point control reg. is in the thread structure + */ + tmp = child->thread.fpu.fpc; + + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); + if (MACHINE_HAS_VX) + tmp = *(__u32 *) + ((addr_t) child->thread.fpu.vxrs + 2*offset); + else + tmp = *(__u32 *) + ((addr_t) child->thread.fpu.fprs + offset); + + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { + /* + * Handle access to the per_info structure. + */ + addr -= offsetof(struct compat_user, regs.per_info); + tmp = __peek_user_per_compat(child, addr); + + } else + tmp = 0; + + return tmp; +} + +static int peek_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + __u32 tmp; + + if (!is_compat_task() || (addr & 3) || addr > sizeof(struct user) - 3) + return -EIO; + + tmp = __peek_user_compat(child, addr); + return put_user(tmp, (__u32 __user *) data); +} + +/* + * Same as poke_user_per but for a 31 bit program. + */ +static inline void __poke_user_per_compat(struct task_struct *child, + addr_t addr, __u32 data) +{ + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) + /* PER event mask of the user specified per set. */ + child->thread.per_user.control = + data & (PER_EVENT_MASK | PER_CONTROL_MASK); + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) + /* Starting address of the user specified per set. */ + child->thread.per_user.start = data; + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) + /* Ending address of the user specified per set. */ + child->thread.per_user.end = data; +} + +/* + * Same as poke_user but for a 31 bit program. + */ +static int __poke_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + __u32 tmp = (__u32) data; + addr_t offset; + + if (addr < offsetof(struct compat_user, regs.acrs)) { + struct pt_regs *regs = task_pt_regs(child); + /* + * psw, gprs, acrs and orig_gpr2 are stored on the stack + */ + if (addr == offsetof(struct compat_user, regs.psw.mask)) { + __u32 mask = PSW32_MASK_USER; + + mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; + /* Build a 64 bit psw mask from 31 bit mask. */ + if ((tmp ^ PSW32_USER_BITS) & ~mask) + /* Invalid psw mask. */ + return -EINVAL; + if ((data & PSW32_MASK_ASC) == PSW32_ASC_HOME) + /* Invalid address-space-control bits */ + return -EINVAL; + regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | + (regs->psw.mask & PSW_MASK_BA) | + (__u64)(tmp & mask) << 32; + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { + /* Build a 64 bit psw address from 31 bit address. */ + regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; + /* Transfer 31 bit amode bit to psw mask. */ + regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | + (__u64)(tmp & PSW32_ADDR_AMODE); + } else { + if (test_pt_regs_flag(regs, PIF_SYSCALL) && + addr == offsetof(struct compat_user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + + regs->int_code = 0x20000 | (data & 0xffff); + } + /* gpr 0-15 */ + *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; + } + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { + /* + * access registers are stored in the thread structure + */ + offset = addr - offsetof(struct compat_user, regs.acrs); + *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; + + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { + /* + * orig_gpr2 is stored on the kernel stack + */ + *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { + /* + * prevent writess of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { + /* + * floating point control reg. is in the thread structure + */ + if (test_fp_ctl(tmp)) + return -EINVAL; + child->thread.fpu.fpc = data; + + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { + /* + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array + */ + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); + if (MACHINE_HAS_VX) + *(__u32 *)((addr_t) + child->thread.fpu.vxrs + 2*offset) = tmp; + else + *(__u32 *)((addr_t) + child->thread.fpu.fprs + offset) = tmp; + + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { + /* + * Handle access to the per_info structure. + */ + addr -= offsetof(struct compat_user, regs.per_info); + __poke_user_per_compat(child, addr, data); + } + + return 0; +} + +static int poke_user_compat(struct task_struct *child, + addr_t addr, addr_t data) +{ + if (!is_compat_task() || (addr & 3) || + addr > sizeof(struct compat_user) - 3) + return -EIO; + + return __poke_user_compat(child, addr, data); +} + +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + compat_ptrace_area parea; + int copied, ret; + + switch (request) { + case PTRACE_PEEKUSR: + /* read the word at location addr in the USER area. */ + return peek_user_compat(child, addr, data); + + case PTRACE_POKEUSR: + /* write the word at location addr in the USER area */ + return poke_user_compat(child, addr, data); + + case PTRACE_PEEKUSR_AREA: + case PTRACE_POKEUSR_AREA: + if (copy_from_user(&parea, (void __force __user *) addr, + sizeof(parea))) + return -EFAULT; + addr = parea.kernel_addr; + data = parea.process_addr; + copied = 0; + while (copied < parea.len) { + if (request == PTRACE_PEEKUSR_AREA) + ret = peek_user_compat(child, addr, data); + else { + __u32 utmp; + if (get_user(utmp, + (__u32 __force __user *) data)) + return -EFAULT; + ret = poke_user_compat(child, addr, utmp); + } + if (ret) + return ret; + addr += sizeof(unsigned int); + data += sizeof(unsigned int); + copied += sizeof(unsigned int); + } + return 0; + case PTRACE_GET_LAST_BREAK: + return put_user(child->thread.last_break, (unsigned int __user *)data); + } + return compat_ptrace_request(child, request, addr, data); +} +#endif + +/* + * user_regset definitions. + */ + +static int s390_regs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + unsigned pos; + if (target == current) + save_access_regs(target->thread.acrs); + + for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long)) + membuf_store(&to, __peek_user(target, pos)); + return 0; +} + +static int s390_regs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc = 0; + + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + const unsigned long *k = kbuf; + while (count > 0 && !rc) { + rc = __poke_user(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count > 0 && !rc) { + unsigned long word; + rc = __get_user(word, u++); + if (rc) + break; + rc = __poke_user(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + if (rc == 0 && target == current) + restore_access_regs(target->thread.acrs); + + return rc; +} + +static int s390_fpregs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + _s390_fp_regs fp_regs; + + if (target == current) + save_fpu_regs(); + + fp_regs.fpc = target->thread.fpu.fpc; + fpregs_store(&fp_regs, &target->thread.fpu); + + return membuf_write(&to, &fp_regs, sizeof(fp_regs)); +} + +static int s390_fpregs_set(struct task_struct *target, + const struct user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + int rc = 0; + freg_t fprs[__NUM_FPRS]; + + if (target == current) + save_fpu_regs(); + + if (MACHINE_HAS_VX) + convert_vx_to_fp(fprs, target->thread.fpu.vxrs); + else + memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); + + /* If setting FPC, must validate it first. */ + if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { + u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, + 0, offsetof(s390_fp_regs, fprs)); + if (rc) + return rc; + if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + return -EINVAL; + target->thread.fpu.fpc = ufpc[0]; + } + + if (rc == 0 && count > 0) + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + fprs, offsetof(s390_fp_regs, fprs), -1); + if (rc) + return rc; + + if (MACHINE_HAS_VX) + convert_fp_to_vx(target->thread.fpu.vxrs, fprs); + else + memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); + + return rc; +} + +static int s390_last_break_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + return membuf_store(&to, target->thread.last_break); +} + +static int s390_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static int s390_tdb_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct pt_regs *regs = task_pt_regs(target); + size_t size; + + if (!(regs->int_code & 0x200)) + return -ENODATA; + size = sizeof(target->thread.trap_tdb.data); + return membuf_write(&to, target->thread.trap_tdb.data, size); +} + +static int s390_tdb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static int s390_vxrs_low_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = target->thread.fpu.vxrs[i].low; + return membuf_write(&to, vxrs, sizeof(vxrs)); +} + +static int s390_vxrs_low_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i, rc; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = target->thread.fpu.vxrs[i].low; + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + if (rc == 0) + for (i = 0; i < __NUM_VXRS_LOW; i++) + target->thread.fpu.vxrs[i].low = vxrs[i]; + + return rc; +} + +static int s390_vxrs_high_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW, + __NUM_VXRS_HIGH * sizeof(__vector128)); +} + +static int s390_vxrs_high_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc; + + if (!MACHINE_HAS_VX) + return -ENODEV; + if (target == current) + save_fpu_regs(); + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1); + return rc; +} + +static int s390_system_call_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + return membuf_store(&to, target->thread.system_call); +} + +static int s390_system_call_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + unsigned int *data = &target->thread.system_call; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(unsigned int)); +} + +static int s390_gs_cb_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct gs_cb *data = target->thread.gs_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + if (target == current) + save_gs_cb(data); + return membuf_write(&to, data, sizeof(struct gs_cb)); +} + +static int s390_gs_cb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb gs_cb = { }, *data = NULL; + int rc; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!target->thread.gs_cb) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + } + if (!target->thread.gs_cb) + gs_cb.gsd = 25; + else if (target == current) + save_gs_cb(&gs_cb); + else + gs_cb = *target->thread.gs_cb; + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &gs_cb, 0, sizeof(gs_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + preempt_disable(); + if (!target->thread.gs_cb) + target->thread.gs_cb = data; + *target->thread.gs_cb = gs_cb; + if (target == current) { + __ctl_set_bit(2, 4); + restore_gs_cb(target->thread.gs_cb); + } + preempt_enable(); + return rc; +} + +static int s390_gs_bc_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + return membuf_write(&to, data, sizeof(struct gs_cb)); +} + +static int s390_gs_bc_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + target->thread.gs_bc_cb = data; + } + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static bool is_ri_cb_valid(struct runtime_instr_cb *cb) +{ + return (cb->rca & 0x1f) == 0 && + (cb->roa & 0xfff) == 0 && + (cb->rla & 0xfff) == 0xfff && + cb->s == 1 && + cb->k == 1 && + cb->h == 0 && + cb->reserved1 == 0 && + cb->ps == 1 && + cb->qs == 0 && + cb->pc == 1 && + cb->qc == 0 && + cb->reserved2 == 0 && + cb->reserved3 == 0 && + cb->reserved4 == 0 && + cb->reserved5 == 0 && + cb->reserved6 == 0 && + cb->reserved7 == 0 && + cb->reserved8 == 0 && + cb->rla >= cb->roa && + cb->rca >= cb->roa && + cb->rca <= cb->rla+1 && + cb->m < 3; +} + +static int s390_runtime_instr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct runtime_instr_cb *data = target->thread.ri_cb; + + if (!test_facility(64)) + return -ENODEV; + if (!data) + return -ENODATA; + + return membuf_write(&to, data, sizeof(struct runtime_instr_cb)); +} + +static int s390_runtime_instr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct runtime_instr_cb ri_cb = { }, *data = NULL; + int rc; + + if (!test_facility(64)) + return -ENODEV; + + if (!target->thread.ri_cb) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + } + + if (target->thread.ri_cb) { + if (target == current) + store_runtime_instr_cb(&ri_cb); + else + ri_cb = *target->thread.ri_cb; + } + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &ri_cb, 0, sizeof(struct runtime_instr_cb)); + if (rc) { + kfree(data); + return -EFAULT; + } + + if (!is_ri_cb_valid(&ri_cb)) { + kfree(data); + return -EINVAL; + } + /* + * Override access key in any case, since user space should + * not be able to set it, nor should it care about it. + */ + ri_cb.key = PAGE_DEFAULT_KEY >> 4; + preempt_disable(); + if (!target->thread.ri_cb) + target->thread.ri_cb = data; + *target->thread.ri_cb = ri_cb; + if (target == current) + load_runtime_instr_cb(target->thread.ri_cb); + preempt_enable(); + + return 0; +} + +static const struct user_regset s390_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = sizeof(s390_regs) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = s390_regs_get, + .set = s390_regs_set, + }, + { + .core_note_type = NT_PRFPREG, + .n = sizeof(s390_fp_regs) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = s390_fpregs_get, + .set = s390_fpregs_set, + }, + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(unsigned int), + .align = sizeof(unsigned int), + .regset_get = s390_system_call_get, + .set = s390_system_call_set, + }, + { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = s390_last_break_get, + .set = s390_last_break_set, + }, + { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .regset_get = s390_tdb_get, + .set = s390_tdb_set, + }, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .regset_get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, + }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, + { + .core_note_type = NT_S390_GS_BC, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_gs_bc_get, + .set = s390_gs_bc_set, + }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, +}; + +static const struct user_regset_view user_s390_view = { + .name = "s390x", + .e_machine = EM_S390, + .regsets = s390_regsets, + .n = ARRAY_SIZE(s390_regsets) +}; + +#ifdef CONFIG_COMPAT +static int s390_compat_regs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + unsigned n; + + if (target == current) + save_access_regs(target->thread.acrs); + + for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) + membuf_store(&to, __peek_user_compat(target, n)); + return 0; +} + +static int s390_compat_regs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc = 0; + + if (target == current) + save_access_regs(target->thread.acrs); + + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0 && !rc) { + rc = __poke_user_compat(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !rc) { + compat_ulong_t word; + rc = __get_user(word, u++); + if (rc) + break; + rc = __poke_user_compat(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + if (rc == 0 && target == current) + restore_access_regs(target->thread.acrs); + + return rc; +} + +static int s390_compat_regs_high_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + compat_ulong_t *gprs_high; + int i; + + gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; + for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) + membuf_store(&to, *gprs_high); + return 0; +} + +static int s390_compat_regs_high_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + compat_ulong_t *gprs_high; + int rc = 0; + + gprs_high = (compat_ulong_t *) + &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0) { + *gprs_high = *k++; + *gprs_high += 2; + count -= sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !rc) { + unsigned long word; + rc = __get_user(word, u++); + if (rc) + break; + *gprs_high = word; + *gprs_high += 2; + count -= sizeof(*u); + } + } + + return rc; +} + +static int s390_compat_last_break_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + compat_ulong_t last_break = target->thread.last_break; + + return membuf_store(&to, (unsigned long)last_break); +} + +static int s390_compat_last_break_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +static const struct user_regset s390_compat_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .regset_get = s390_compat_regs_get, + .set = s390_compat_regs_set, + }, + { + .core_note_type = NT_PRFPREG, + .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .regset_get = s390_fpregs_get, + .set = s390_fpregs_set, + }, + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(compat_uint_t), + .align = sizeof(compat_uint_t), + .regset_get = s390_system_call_get, + .set = s390_system_call_set, + }, + { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = s390_compat_last_break_get, + .set = s390_compat_last_break_set, + }, + { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .regset_get = s390_tdb_get, + .set = s390_tdb_set, + }, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .regset_get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, + }, + { + .core_note_type = NT_S390_HIGH_GPRS, + .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), + .size = sizeof(compat_long_t), + .align = sizeof(compat_long_t), + .regset_get = s390_compat_regs_high_get, + .set = s390_compat_regs_high_set, + }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, + { + .core_note_type = NT_S390_GS_BC, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_gs_bc_get, + .set = s390_gs_bc_set, + }, + { + .core_note_type = NT_S390_RI_CB, + .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .regset_get = s390_runtime_instr_get, + .set = s390_runtime_instr_set, + }, +}; + +static const struct user_regset_view user_s390_compat_view = { + .name = "s390", + .e_machine = EM_S390, + .regsets = s390_compat_regsets, + .n = ARRAY_SIZE(s390_compat_regsets) +}; +#endif + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_COMPAT + if (test_tsk_thread_flag(task, TIF_31BIT)) + return &user_s390_compat_view; +#endif + return &user_s390_view; +} + +static const char *gpr_names[NUM_GPRS] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", +}; + +unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) +{ + if (offset >= NUM_GPRS) + return 0; + return regs->gprs[offset]; +} + +int regs_query_register_offset(const char *name) +{ + unsigned long offset; + + if (!name || *name != 'r') + return -EINVAL; + if (kstrtoul(name + 1, 10, &offset)) + return -EINVAL; + if (offset >= NUM_GPRS) + return -EINVAL; + return offset; +} + +const char *regs_query_register_name(unsigned int offset) +{ + if (offset >= NUM_GPRS) + return NULL; + return gpr_names[offset]; +} + +static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + unsigned long ksp = kernel_stack_pointer(regs); + + return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs:pt_regs which contains kernel stack pointer. + * @n:stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long addr; + + addr = kernel_stack_pointer(regs) + n * sizeof(long); + if (!regs_within_kernel_stack(regs, addr)) + return 0; + return *(unsigned long *)addr; +} diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S new file mode 100644 index 0000000000..88087a32eb --- /dev/null +++ b/arch/s390/kernel/reipl.S @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp 2000, 2011 + * Author(s): Holger Smolinski , + * Denis Joseph Barrow, + */ + +#include +#include +#include +#include + + GEN_BR_THUNK %r9 + +# +# Issue "store status" for the current CPU to its prefix page +# and call passed function afterwards +# +# r2 = Function to be called after store status +# r3 = Parameter for function +# +SYM_CODE_START(store_status) + /* Save register one and load save area base */ + stg %r1,__LC_SAVE_AREA_RESTART + /* General purpose registers */ + lghi %r1,__LC_GPREGS_SAVE_AREA + stmg %r0,%r15,0(%r1) + mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + /* Control registers */ + lghi %r1,__LC_CREGS_SAVE_AREA + stctg %c0,%c15,0(%r1) + /* Access registers */ + lghi %r1,__LC_AREGS_SAVE_AREA + stam %a0,%a15,0(%r1) + /* Floating point registers */ + lghi %r1,__LC_FPREGS_SAVE_AREA + std %f0, 0x00(%r1) + std %f1, 0x08(%r1) + std %f2, 0x10(%r1) + std %f3, 0x18(%r1) + std %f4, 0x20(%r1) + std %f5, 0x28(%r1) + std %f6, 0x30(%r1) + std %f7, 0x38(%r1) + std %f8, 0x40(%r1) + std %f9, 0x48(%r1) + std %f10,0x50(%r1) + std %f11,0x58(%r1) + std %f12,0x60(%r1) + std %f13,0x68(%r1) + std %f14,0x70(%r1) + std %f15,0x78(%r1) + /* Floating point control register */ + lghi %r1,__LC_FP_CREG_SAVE_AREA + stfpc 0(%r1) + /* CPU timer */ + lghi %r1,__LC_CPU_TIMER_SAVE_AREA + stpt 0(%r1) + /* Store prefix register */ + lghi %r1,__LC_PREFIX_SAVE_AREA + stpx 0(%r1) + /* Clock comparator - seven bytes */ + lghi %r1,__LC_CLOCK_COMP_SAVE_AREA + larl %r4,clkcmp + stckc 0(%r4) + mvc 1(7,%r1),1(%r4) + /* Program status word */ + lghi %r1,__LC_PSW_SAVE_AREA + epsw %r4,%r5 + st %r4,0(%r1) + st %r5,4(%r1) + stg %r2,8(%r1) + lgr %r9,%r2 + lgr %r2,%r3 + BR_EX %r9 +SYM_CODE_END(store_status) + + .section .bss + .balign 8 +SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000) + .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S new file mode 100644 index 0000000000..0ae297c82a --- /dev/null +++ b/arch/s390/kernel/relocate_kernel.S @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2005 + * + * Author(s): Rolf Adelsberger + * + */ + +#include +#include +#include + +/* + * moves the new kernel to its destination... + * %r2 = pointer to first kimage_entry_t + * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode + * + * %r5 will be used as temp. storage + * %r6 holds the destination address + * %r7 = PAGE_SIZE + * %r8 holds the source address + * %r9 = PAGE_SIZE + * + * 0xf000 is a page_mask + */ + + .text +SYM_CODE_START(relocate_kernel) + basr %r13,0 # base address +.base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration +.indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... +.done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... +.source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking +0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base +.done: + lgr %r0,%r4 # subcode + cghi %r3,0 + je .diag + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 +.diag: + diag %r0,%r0,0x308 +SYM_CODE_END(relocate_kernel) + + .balign 8 +SYM_DATA_START_LOCAL(load_psw) + .long 0x00080000,0x80000000 +SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end) + .balign 8 +SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel) diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 0000000000..af10e6bdd3 --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. */ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 0000000000..32f069eed3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c new file mode 100644 index 0000000000..1788a5454b --- /dev/null +++ b/arch/s390/kernel/runtime_instr.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * Author(s): Jan Glauber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "entry.h" + +/* empty control block to disable RI by loading it */ +struct runtime_instr_cb runtime_instr_empty_cb; + +void runtime_instr_release(struct task_struct *tsk) +{ + kfree(tsk->thread.ri_cb); +} + +static void disable_runtime_instr(void) +{ + struct task_struct *task = current; + struct pt_regs *regs; + + if (!task->thread.ri_cb) + return; + regs = task_pt_regs(task); + preempt_disable(); + load_runtime_instr_cb(&runtime_instr_empty_cb); + kfree(task->thread.ri_cb); + task->thread.ri_cb = NULL; + preempt_enable(); + + /* + * Make sure the RI bit is deleted from the PSW. If the user did not + * switch off RI before the system call the process will get a + * specification exception otherwise. + */ + regs->psw.mask &= ~PSW_MASK_RI; +} + +static void init_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + cb->rla = 0xfff; + cb->s = 1; + cb->k = 1; + cb->ps = 1; + cb->pc = 1; + cb->key = PAGE_DEFAULT_KEY >> 4; + cb->v = 1; +} + +/* + * The signum argument is unused. In older kernels it was used to + * specify a real-time signal. For backwards compatibility user space + * should pass a valid real-time signal number (the signum argument + * was checked in older kernels). + */ +SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum) +{ + struct runtime_instr_cb *cb; + + if (!test_facility(64)) + return -EOPNOTSUPP; + + if (command == S390_RUNTIME_INSTR_STOP) { + disable_runtime_instr(); + return 0; + } + + if (command != S390_RUNTIME_INSTR_START) + return -EINVAL; + + if (!current->thread.ri_cb) { + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + } else { + cb = current->thread.ri_cb; + memset(cb, 0, sizeof(*cb)); + } + + init_runtime_instr_cb(cb); + + /* now load the control block to make it available */ + preempt_disable(); + current->thread.ri_cb = cb; + load_runtime_instr_cb(cb); + preempt_enable(); + return 0; +} diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c new file mode 100644 index 0000000000..de6ad0fb23 --- /dev/null +++ b/arch/s390/kernel/setup.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2012 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * Derived from "arch/i386/kernel/setup.c" + * Copyright (C) 1995, Linus Torvalds + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#define KMSG_COMPONENT "setup" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +/* + * Machine setup.. + */ +unsigned int console_mode = 0; +EXPORT_SYMBOL(console_mode); + +unsigned int console_devno = -1; +EXPORT_SYMBOL(console_devno); + +unsigned int console_irq = -1; +EXPORT_SYMBOL(console_irq); + +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .amode31 section. + */ +char __amode31_ref *__samode31 = _samode31; +char __amode31_ref *__eamode31 = _eamode31; +char __amode31_ref *__stext_amode31 = _stext_amode31; +char __amode31_ref *__etext_amode31 = _etext_amode31; +struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; +struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; + +/* + * Control registers CR2, CR5 and CR15 are initialized with addresses + * of tables that must be placed below 2G which is handled by the AMODE31 + * sections. + * Because the AMODE31 sections are relocated below 2G at startup, + * the content of control registers CR2, CR5 and CR15 must be updated + * with new addresses after the relocation. The initial initialization of + * control registers occurs in head64.S and then gets updated again after AMODE31 + * relocation. We must access the relevant AMODE31 tables indirectly via + * pointers placed in the .amode31.refs linker section. Those pointers get + * updated automatically during AMODE31 relocation and always contain a valid + * address within AMODE31 sections. + */ + +static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64); + +static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = { + [1] = 0xffffffffffffffff +}; + +static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = { + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0 +}; + +static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = { + 0, 0, 0x89000000, 0, + 0, 0, 0x8a000000, 0 +}; + +static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31; +static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31; +static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; +static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; + +unsigned long __bootdata_preserved(max_mappable); +unsigned long __bootdata(ident_map_size); +struct physmem_info __bootdata(physmem_info); + +unsigned long __bootdata_preserved(__kaslr_offset); +int __bootdata_preserved(__kaslr_enabled); +unsigned int __bootdata_preserved(zlib_dfltcc_support); +EXPORT_SYMBOL(zlib_dfltcc_support); +u64 __bootdata_preserved(stfle_fac_list[16]); +EXPORT_SYMBOL(stfle_fac_list); +u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); + +unsigned long VMALLOC_START; +EXPORT_SYMBOL(VMALLOC_START); + +unsigned long VMALLOC_END; +EXPORT_SYMBOL(VMALLOC_END); + +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long vmemmap_size; + +unsigned long MODULES_VADDR; +unsigned long MODULES_END; + +/* An array with a pointer to the lowcore of every CPU. */ +struct lowcore *lowcore_ptr[NR_CPUS]; +EXPORT_SYMBOL(lowcore_ptr); + +DEFINE_STATIC_KEY_FALSE(cpu_has_bear); + +/* + * The Write Back bit position in the physaddr is given by the SLPC PCI. + * Leaving the mask zero always uses write through which is safe + */ +unsigned long mio_wb_bit_mask __ro_after_init; + +/* + * This is set up by the setup-routine at boot-time + * for S390 need to find out, what we have to setup + * using address 0x10400 ... + */ + +#include + +/* + * condev= and conmode= setup parameter. + */ + +static int __init condev_setup(char *str) +{ + int vdev; + + vdev = simple_strtoul(str, &str, 0); + if (vdev >= 0 && vdev < 65536) { + console_devno = vdev; + console_irq = -1; + } + return 1; +} + +__setup("condev=", condev_setup); + +static void __init set_preferred_console(void) +{ + if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) + add_preferred_console("ttyS", 0, NULL); + else if (CONSOLE_IS_3270) + add_preferred_console("tty3270", 0, NULL); + else if (CONSOLE_IS_VT220) + add_preferred_console("ttysclp", 0, NULL); + else if (CONSOLE_IS_HVC) + add_preferred_console("hvc", 0, NULL); +} + +static int __init conmode_setup(char *str) +{ +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + if (!strcmp(str, "hwc") || !strcmp(str, "sclp")) + SET_CONSOLE_SCLP; +#endif +#if defined(CONFIG_TN3215_CONSOLE) + if (!strcmp(str, "3215")) + SET_CONSOLE_3215; +#endif +#if defined(CONFIG_TN3270_CONSOLE) + if (!strcmp(str, "3270")) + SET_CONSOLE_3270; +#endif + set_preferred_console(); + return 1; +} + +__setup("conmode=", conmode_setup); + +static void __init conmode_default(void) +{ + char query_buffer[1024]; + char *ptr; + + if (MACHINE_IS_VM) { + cpcmd("QUERY CONSOLE", query_buffer, 1024, NULL); + console_devno = simple_strtoul(query_buffer + 5, NULL, 16); + ptr = strstr(query_buffer, "SUBCHANNEL ="); + console_irq = simple_strtoul(ptr + 13, NULL, 16); + cpcmd("QUERY TERM", query_buffer, 1024, NULL); + ptr = strstr(query_buffer, "CONMODE"); + /* + * Set the conmode to 3215 so that the device recognition + * will set the cu_type of the console to 3215. If the + * conmode is 3270 and we don't set it back then both + * 3215 and the 3270 driver will try to access the console + * device (3215 as console and 3270 as normal tty). + */ + cpcmd("TERM CONMODE 3215", NULL, 0, NULL); + if (ptr == NULL) { +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + return; + } + if (str_has_prefix(ptr + 8, "3270")) { +#if defined(CONFIG_TN3270_CONSOLE) + SET_CONSOLE_3270; +#elif defined(CONFIG_TN3215_CONSOLE) + SET_CONSOLE_3215; +#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } else if (str_has_prefix(ptr + 8, "3215")) { +#if defined(CONFIG_TN3215_CONSOLE) + SET_CONSOLE_3215; +#elif defined(CONFIG_TN3270_CONSOLE) + SET_CONSOLE_3270; +#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } + } else if (MACHINE_IS_KVM) { + if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) + SET_CONSOLE_VT220; + else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) + SET_CONSOLE_SCLP; + else + SET_CONSOLE_HVC; + } else { +#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) + SET_CONSOLE_SCLP; +#endif + } +} + +#ifdef CONFIG_CRASH_DUMP +static void __init setup_zfcpdump(void) +{ + if (!is_ipl_type_dump()) + return; + if (oldmem_data.start) + return; + strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + console_loglevel = 2; +} +#else +static inline void setup_zfcpdump(void) {} +#endif /* CONFIG_CRASH_DUMP */ + + /* + * Reboot, halt and power_off stubs. They just call _machine_restart, + * _machine_halt or _machine_power_off. + */ + +void machine_restart(char *command) +{ + if ((!in_interrupt() && !in_atomic()) || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_restart(command); +} + +void machine_halt(void) +{ + if (!in_interrupt() || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_halt(); +} + +void machine_power_off(void) +{ + if (!in_interrupt() || oops_in_progress) + /* + * Only unblank the console if we are called in enabled + * context or a bust_spinlocks cleared the way for us. + */ + console_unblank(); + _machine_power_off(); +} + +/* + * Dummy power off function. + */ +void (*pm_power_off)(void) = machine_power_off; +EXPORT_SYMBOL_GPL(pm_power_off); + +void *restart_stack; + +unsigned long stack_alloc(void) +{ +#ifdef CONFIG_VMAP_STACK + void *ret; + + ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(ret); + return (unsigned long)ret; +#else + return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); +#endif +} + +void stack_free(unsigned long stack) +{ +#ifdef CONFIG_VMAP_STACK + vfree((void *) stack); +#else + free_pages(stack, THREAD_SIZE_ORDER); +#endif +} + +void __init __noreturn arch_call_rest_init(void) +{ + smp_reinit_ipl_cpu(); + rest_init(); +} + +static unsigned long __init stack_alloc_early(void) +{ + unsigned long stack; + + stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); + if (!stack) { + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, THREAD_SIZE, THREAD_SIZE); + } + return stack; +} + +static void __init setup_lowcore(void) +{ + struct lowcore *lc, *abs_lc; + + /* + * Setup lowcore for boot cpu + */ + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); + lc = memblock_alloc_low(sizeof(*lc), sizeof(*lc)); + if (!lc) + panic("%s: Failed to allocate %zu bytes align=%zx\n", + __func__, sizeof(*lc), sizeof(*lc)); + + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->external_new_psw.addr = (unsigned long) ext_int_handler; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->svc_new_psw.addr = (unsigned long) system_call; + lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->program_new_psw.addr = (unsigned long) pgm_check_handler; + lc->mcck_new_psw.mask = PSW_KERNEL_BITS; + lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; + lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; + lc->io_new_psw.addr = (unsigned long) io_int_handler; + lc->clock_comparator = clock_comparator_max; + lc->current_task = (unsigned long)&init_task; + lc->lpp = LPP_MAGIC; + lc->machine_flags = S390_lowcore.machine_flags; + lc->preempt_count = S390_lowcore.preempt_count; + nmi_alloc_mcesa_early(&lc->mcesad); + lc->sys_enter_timer = S390_lowcore.sys_enter_timer; + lc->exit_timer = S390_lowcore.exit_timer; + lc->user_timer = S390_lowcore.user_timer; + lc->system_timer = S390_lowcore.system_timer; + lc->steal_timer = S390_lowcore.steal_timer; + lc->last_update_timer = S390_lowcore.last_update_timer; + lc->last_update_clock = S390_lowcore.last_update_clock; + /* + * Allocate the global restart stack which is the same for + * all CPUs in case *one* of them does a PSW restart. + */ + restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET); + lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->kernel_stack = S390_lowcore.kernel_stack; + /* + * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant + * restart data to the absolute zero lowcore. This is necessary if + * PSW restart is done on an offline CPU that has lowcore zero. + */ + lc->restart_stack = (unsigned long) restart_stack; + lc->restart_fn = (unsigned long) do_restart; + lc->restart_data = 0; + lc->restart_source = -1U; + __ctl_store(lc->cregs_save_area, 0, 15); + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; + arch_spin_lock_setup(0); + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + memcpy(abs_lc->cregs_save_area, lc->cregs_save_area, sizeof(abs_lc->cregs_save_area)); + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); + + set_prefix(__pa(lc)); + lowcore_ptr[0] = lc; + if (abs_lowcore_map(0, lowcore_ptr[0], false)) + panic("Couldn't setup absolute lowcore"); +} + +static struct resource code_resource = { + .name = "Kernel code", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource data_resource = { + .name = "Kernel data", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, +}; + +static struct resource __initdata *standard_resources[] = { + &code_resource, + &data_resource, + &bss_resource, +}; + +static void __init setup_resources(void) +{ + struct resource *res, *std_res, *sub_res; + phys_addr_t start, end; + int j; + u64 i; + + code_resource.start = (unsigned long) _text; + code_resource.end = (unsigned long) _etext - 1; + data_resource.start = (unsigned long) _etext; + data_resource.end = (unsigned long) _edata - 1; + bss_resource.start = (unsigned long) __bss_start; + bss_resource.end = (unsigned long) __bss_stop - 1; + + for_each_mem_range(i, &start, &end) { + res = memblock_alloc(sizeof(*res), 8); + if (!res) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(*res), 8); + res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + + res->name = "System RAM"; + res->start = start; + /* + * In memblock, end points to the first byte after the + * range while in resources, end points to the last byte in + * the range. + */ + res->end = end - 1; + request_resource(&iomem_resource, res); + + for (j = 0; j < ARRAY_SIZE(standard_resources); j++) { + std_res = standard_resources[j]; + if (std_res->start < res->start || + std_res->start > res->end) + continue; + if (std_res->end > res->end) { + sub_res = memblock_alloc(sizeof(*sub_res), 8); + if (!sub_res) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(*sub_res), 8); + *sub_res = *std_res; + sub_res->end = res->end; + std_res->start = res->end + 1; + request_resource(res, sub_res); + } else { + request_resource(res, std_res); + } + } + } +#ifdef CONFIG_CRASH_DUMP + /* + * Re-add removed crash kernel memory as reserved memory. This makes + * sure it will be mapped with the identity mapping and struct pages + * will be created, so it can be resized later on. + * However add it later since the crash kernel resource should not be + * part of the System RAM resource. + */ + if (crashk_res.end) { + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); + memblock_reserve(crashk_res.start, resource_size(&crashk_res)); + insert_resource(&iomem_resource, &crashk_res); + } +#endif +} + +static void __init setup_memory_end(void) +{ + max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); + pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); +} + +#ifdef CONFIG_CRASH_DUMP + +/* + * When kdump is enabled, we have to ensure that no memory from the area + * [0 - crashkernel memory size] is set offline - it will be exchanged with + * the crashkernel memory region when kdump is triggered. The crashkernel + * memory region can never get offlined (pages are unmovable). + */ +static int kdump_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct memory_notify *arg = data; + + if (action != MEM_GOING_OFFLINE) + return NOTIFY_OK; + if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) + return NOTIFY_BAD; + return NOTIFY_OK; +} + +static struct notifier_block kdump_mem_nb = { + .notifier_call = kdump_mem_notifier, +}; + +#endif + +/* + * Reserve page tables created by decompressor + */ +static void __init reserve_pgtables(void) +{ + unsigned long start, end; + struct reserved_range *range; + + for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end) + memblock_reserve(start, end - start); +} + +/* + * Reserve memory for kdump kernel to be loaded with kexec + */ +static void __init reserve_crashkernel(void) +{ +#ifdef CONFIG_CRASH_DUMP + unsigned long long crash_base, crash_size; + phys_addr_t low, high; + int rc; + + rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size, + &crash_base); + + crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); + crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); + if (rc || crash_size == 0) + return; + + if (memblock.memory.regions[0].size < crash_size) { + pr_info("crashkernel reservation failed: %s\n", + "first memory chunk must be at least crashkernel size"); + return; + } + + low = crash_base ?: oldmem_data.start; + high = low + crash_size; + if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) { + /* The crashkernel fits into OLDMEM, reuse OLDMEM */ + crash_base = low; + } else { + /* Find suitable area in free memory */ + low = max_t(unsigned long, crash_size, sclp.hsa_size); + high = crash_base ? crash_base + crash_size : ULONG_MAX; + + if (crash_base && crash_base < low) { + pr_info("crashkernel reservation failed: %s\n", + "crash_base too low"); + return; + } + low = crash_base ?: low; + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); + } + + if (!crash_base) { + pr_info("crashkernel reservation failed: %s\n", + "no suitable area found"); + return; + } + + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + if (!oldmem_data.start && MACHINE_IS_VM) + diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + memblock_remove(crash_base, crash_size); + pr_info("Reserving %lluMB of memory at %lluMB " + "for crashkernel (System RAM: %luMB)\n", + crash_size >> 20, crash_base >> 20, + (unsigned long)memblock.memory.total_size >> 20); + os_info_crashkernel_add(crash_base, crash_size); +#endif +} + +/* + * Reserve the initrd from being used by memblock + */ +static void __init reserve_initrd(void) +{ + unsigned long addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size)) + return; + initrd_start = (unsigned long)__va(addr); + initrd_end = initrd_start + size; + memblock_reserve(addr, size); +} + +/* + * Reserve the memory area used to pass the certificate lists + */ +static void __init reserve_certificate_list(void) +{ + if (ipl_cert_list_addr) + memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); +} + +static void __init reserve_physmem_info(void) +{ + unsigned long addr, size; + + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_reserve(addr, size); +} + +static void __init free_physmem_info(void) +{ + unsigned long addr, size; + + if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + memblock_phys_free(addr, size); +} + +static void __init memblock_add_physmem_info(void) +{ + unsigned long start, end; + int i; + + pr_debug("physmem info source: %s (%hhd)\n", + get_physmem_info_source(), physmem_info.info_source); + /* keep memblock lists close to the kernel */ + memblock_set_bottom_up(true); + for_each_physmem_usable_range(i, &start, &end) + memblock_add(start, end - start); + for_each_physmem_online_range(i, &start, &end) + memblock_physmem_add(start, end - start); + memblock_set_bottom_up(false); + memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); +} + +/* + * Reserve memory used for lowcore/command line/kernel image. + */ +static void __init reserve_kernel(void) +{ + memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); + memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); +} + +static void __init setup_memory(void) +{ + phys_addr_t start, end; + u64 i; + + /* + * Init storage key for present memory + */ + for_each_mem_range(i, &start, &end) + storage_key_init_range(start, end); + + psw_set_key(PAGE_DEFAULT_KEY); +} + +static void __init relocate_amode31_section(void) +{ + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset, *ptr; + + amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31; + pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); + + /* Move original AMODE31 section to the new one */ + memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size); + /* Zero out the old AMODE31 section to catch invalid accesses within it */ + memset(__samode31, 0, amode31_size); + + /* Update all AMODE31 region references */ + for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++) + *ptr += amode31_offset; +} + +/* This must be called after AMODE31 relocation */ +static void __init setup_cr(void) +{ + union ctlreg2 cr2; + union ctlreg5 cr5; + union ctlreg15 cr15; + + __ctl_duct[1] = (unsigned long)__ctl_aste; + __ctl_duct[2] = (unsigned long)__ctl_aste; + __ctl_duct[4] = (unsigned long)__ctl_duald; + + /* Update control registers CR2, CR5 and CR15 */ + __ctl_store(cr2.val, 2, 2); + __ctl_store(cr5.val, 5, 5); + __ctl_store(cr15.val, 15, 15); + cr2.ducto = (unsigned long)__ctl_duct >> 6; + cr5.pasteo = (unsigned long)__ctl_duct >> 6; + cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3; + __ctl_load(cr2.val, 2, 2); + __ctl_load(cr5.val, 5, 5); + __ctl_load(cr15.val, 15, 15); +} + +/* + * Add system information as device randomness + */ +static void __init setup_randomness(void) +{ + struct sysinfo_3_2_2 *vmms; + + vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!vmms) + panic("Failed to allocate memory for sysinfo structure\n"); + if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) + add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); + memblock_free(vmms, PAGE_SIZE); + + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); +} + +/* + * Find the correct size for the task_struct. This depends on + * the size of the struct fpu at the end of the thread_struct + * which is embedded in the task_struct. + */ +static void __init setup_task_size(void) +{ + int task_size = sizeof(struct task_struct); + + if (!MACHINE_HAS_VX) { + task_size -= sizeof(__vector128) * __NUM_VXRS; + task_size += sizeof(freg_t) * __NUM_FPRS; + } + arch_task_struct_size = task_size; +} + +/* + * Issue diagnose 318 to set the control program name and + * version codes. + */ +static void __init setup_control_program_code(void) +{ + union diag318_info diag318_info = { + .cpnc = CPNC_LINUX, + .cpvc = 0, + }; + + if (!sclp.has_diag318) + return; + + diag_stat_inc(DIAG_STAT_X318); + asm volatile("diag %0,0,0x318\n" : : "d" (diag318_info.val)); +} + +/* + * Print the component list from the IPL report + */ +static void __init log_component_list(void) +{ + struct ipl_rb_component_entry *ptr, *end; + char *str; + + if (!early_ipl_comp_list_addr) + return; + if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL) + pr_info("Linux is running with Secure-IPL enabled\n"); + else + pr_info("Linux is running with Secure-IPL disabled\n"); + ptr = __va(early_ipl_comp_list_addr); + end = (void *) ptr + early_ipl_comp_list_size; + pr_info("The IPL report contains the following components:\n"); + while (ptr < end) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_SIGNED) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_VERIFIED) + str = "signed, verified"; + else + str = "signed, verification failed"; + } else { + str = "not signed"; + } + pr_info("%016llx - %016llx (%s)\n", + ptr->addr, ptr->addr + ptr->len, str); + ptr++; + } +} + +/* + * Setup function called from init/main.c just after the banner + * was printed. + */ + +void __init setup_arch(char **cmdline_p) +{ + /* + * print what head.S has found out about the machine + */ + if (MACHINE_IS_VM) + pr_info("Linux is running as a z/VM " + "guest operating system in 64-bit mode\n"); + else if (MACHINE_IS_KVM) + pr_info("Linux is running under KVM in 64-bit mode\n"); + else if (MACHINE_IS_LPAR) + pr_info("Linux is running natively in 64-bit mode\n"); + else + pr_info("Linux is running as a guest in 64-bit mode\n"); + + log_component_list(); + + /* Have one command line that is parsed and saved in /proc/cmdline */ + /* boot_command_line has been already set up in early.c */ + *cmdline_p = boot_command_line; + + ROOT_DEV = Root_RAM0; + + setup_initial_init_mm(_text, _etext, _edata, _end); + + if (IS_ENABLED(CONFIG_EXPOLINE_AUTO)) + nospec_auto_detect(); + + jump_label_init(); + parse_early_param(); +#ifdef CONFIG_CRASH_DUMP + /* Deactivate elfcorehdr= kernel parameter */ + elfcorehdr_addr = ELFCORE_ADDR_MAX; +#endif + + os_info_init(); + setup_ipl(); + setup_task_size(); + setup_control_program_code(); + + /* Do some memory reservations *before* memory is added to memblock */ + reserve_pgtables(); + reserve_kernel(); + reserve_initrd(); + reserve_certificate_list(); + reserve_physmem_info(); + memblock_set_current_limit(ident_map_size); + memblock_allow_resize(); + + /* Get information about *all* installed memory */ + memblock_add_physmem_info(); + + free_physmem_info(); + setup_memory_end(); + memblock_dump_all(); + setup_memory(); + + relocate_amode31_section(); + setup_cr(); + setup_uv(); + dma_contiguous_reserve(ident_map_size); + vmcp_cma_reserve(); + if (MACHINE_HAS_EDAT2) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); + + reserve_crashkernel(); +#ifdef CONFIG_CRASH_DUMP + /* + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. + * Therefore CPU and device initialization should be done afterwards. + */ + smp_save_dump_secondary_cpus(); +#endif + + setup_resources(); + setup_lowcore(); + smp_fill_possible_mask(); + cpu_detect_mhz_feature(); + cpu_init(); + numa_setup(); + smp_detect_cpus(); + topology_init_early(); + + if (test_facility(193)) + static_branch_enable(&cpu_has_bear); + + /* + * Create kernel page tables. + */ + paging_init(); + + /* + * After paging_init created the kernel page table, the new PSWs + * in lowcore can now run with DAT enabled. + */ +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif + + /* Setup default console */ + conmode_default(); + set_preferred_console(); + + apply_alternative_instructions(); + if (IS_ENABLED(CONFIG_EXPOLINE)) + nospec_init_branches(); + + /* Setup zfcp/nvme dump support */ + setup_zfcpdump(); + + /* Add system specific data to the random pool */ + setup_randomness(); +} diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c new file mode 100644 index 0000000000..d63557d386 --- /dev/null +++ b/arch/s390/kernel/signal.c @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2006 + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * + * Based on Intel version + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +/* + * Layout of an old-style signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | struct sigcontext | + * | oldmask | + * | _sigregs * | + * ----------------------------------------- + * | _sigregs with | + * | _s390_regs_common | + * | _s390_fp_regs | + * ----------------------------------------- + * | int signo | + * ----------------------------------------- + * | _sigregs_ext with | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt) | + * | reserved 128 byte (opt) | + * ----------------------------------------- + * | __u16 svc_insn | + * ----------------------------------------- + * The svc_insn entry with the sigreturn system call opcode does not + * have a fixed position and moves if gprs_high or vxrs exist. + * Future extensions will be added to _sigregs_ext. + */ +struct sigframe +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; + struct sigcontext sc; + _sigregs sregs; + int signo; + _sigregs_ext sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ +}; + +/* + * Layout of an rt signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | svc __NR_rt_sigreturn 2 byte | + * ----------------------------------------- + * | struct siginfo | + * ----------------------------------------- + * | struct ucontext_extended with | + * | unsigned long uc_flags | + * | struct ucontext *uc_link | + * | stack_t uc_stack | + * | _sigregs uc_mcontext with | + * | _s390_regs_common | + * | _s390_fp_regs | + * | sigset_t uc_sigmask | + * | _sigregs_ext uc_mcontext_ext | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt)| + * | reserved 128 byte (opt) | + * ----------------------------------------- + * Future extensions will be added to _sigregs_ext. + */ +struct rt_sigframe +{ + __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; + __u16 svc_insn; + struct siginfo info; + struct ucontext_extended uc; +}; + +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + save_access_regs(current->thread.acrs); + save_fpu_regs(); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + restore_access_regs(current->thread.acrs); +} + +/* Returns non-zero on fault. */ +static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) +{ + _sigregs user_sregs; + + /* Copy a 'clean' PSW mask to the user to avoid leaking + information about whether PER is currently on. */ + user_sregs.regs.psw.mask = PSW_USER_BITS | + (regs->psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); + user_sregs.regs.psw.addr = regs->psw.addr; + memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + fpregs_store(&user_sregs.fpregs, ¤t->thread.fpu); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) + return -EFAULT; + return 0; +} + +static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) +{ + _sigregs user_sregs; + + /* Always make any pending restarted system call return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) + return -EFAULT; + + if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) + return -EINVAL; + + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ + regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | + (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); + /* Check for invalid user address space control. */ + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) + regs->psw.mask = PSW_ASC_PRIMARY | + (regs->psw.mask & ~PSW_MASK_ASC); + /* Check for invalid amode */ + if (regs->psw.mask & PSW_MASK_EA) + regs->psw.mask |= PSW_MASK_BA; + regs->psw.addr = user_sregs.regs.psw.addr; + memcpy(®s->gprs, &user_sregs.regs.gprs, sizeof(sregs->regs.gprs)); + memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, + sizeof(current->thread.acrs)); + + fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); + + clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + return 0; +} + +/* Returns non-zero on fault. */ +static int save_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Save vector registers to signal stack */ + if (MACHINE_HAS_VX) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = current->thread.fpu.vxrs[i].low; + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } + return 0; +} + +static int restore_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Restore vector registers from signal stack */ + if (MACHINE_HAS_VX) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + current->thread.fpu.vxrs[i].low = vxrs[i]; + } + return 0; +} + +SYSCALL_DEFINE0(sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + struct sigframe __user *frame = + (struct sigframe __user *) regs->gprs[15]; + sigset_t set; + + if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) + goto badframe; + set_current_blocked(&set); + save_fpu_regs(); + if (restore_sigregs(regs, &frame->sregs)) + goto badframe; + if (restore_sigregs_ext(regs, &frame->sregs_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV); + return 0; +} + +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = task_pt_regs(current); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)regs->gprs[15]; + sigset_t set; + + if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + set_current_blocked(&set); + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + save_fpu_regs(); + if (restore_sigregs(regs, &frame->uc.uc_mcontext)) + goto badframe; + if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) + goto badframe; + load_sigregs(); + return regs->gprs[2]; +badframe: + force_sig(SIGSEGV); + return 0; +} + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = regs->gprs[15]; + + /* Overflow on alternate signal stack gives SIGSEGV. */ + if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) + return (void __user *) -1UL; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (! sas_ss_flags(sp)) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)((sp - frame_size) & -8ul); +} + +static int setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) +{ + struct sigframe __user *frame; + struct sigcontext sc; + unsigned long restorer; + size_t frame_size; + + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); + if (MACHINE_HAS_VX) + frame_size += sizeof(frame->sregs_ext); + frame = get_sigframe(ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) + return -EFAULT; + + /* Create struct sigcontext on the signal stack */ + memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE); + sc.sregs = (_sigregs __user __force *) &frame->sregs; + if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc))) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs on the signal stack */ + if (save_sigregs(regs, &frame->sregs)) + return -EFAULT; + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __user *) &frame->signo)) + return -EFAULT; + + /* Create _sigregs_ext on the signal stack */ + if (save_sigregs_ext(regs, &frame->sregs_ext)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ka->sa.sa_flags & SA_RESTORER) + restorer = (unsigned long) ka->sa.sa_restorer; + else + restorer = VDSO64_SYMBOL(current, sigreturn); + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (unsigned long) frame; + /* Force default amode and default user address space control. */ + regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (unsigned long) ka->sa.sa_handler; + + regs->gprs[2] = sig; + regs->gprs[3] = (unsigned long) &frame->sc; + + /* We forgot to include these in the sigcontext. + To avoid breaking binary compatibility, they are passed as args. */ + if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP || sig == SIGFPE) { + /* set extra registers only for synchronous signals */ + regs->gprs[4] = regs->int_code & 127; + regs->gprs[5] = regs->int_parm_long; + regs->gprs[6] = current->thread.last_break; + } + return 0; +} + +static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long uc_flags, restorer; + size_t frame_size; + + frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + uc_flags = 0; + if (MACHINE_HAS_VX) { + frame_size += sizeof(_sigregs_ext); + uc_flags |= UC_VXRS; + } + frame = get_sigframe(&ksig->ka, regs, frame_size); + if (frame == (void __user *) -1UL) + return -EFAULT; + + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) + return -EFAULT; + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = (unsigned long) ksig->ka.sa.sa_restorer; + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); + + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(NULL, &frame->uc.uc_link) || + __save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs(regs, &frame->uc.uc_mcontext) || + __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) || + save_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->gprs[14] = restorer; + regs->gprs[15] = (unsigned long) frame; + /* Force default amode and default user address space control. */ + regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | + (PSW_USER_BITS & PSW_MASK_ASC) | + (regs->psw.mask & ~PSW_MASK_ASC); + regs->psw.addr = (unsigned long) ksig->ka.sa.sa_handler; + + regs->gprs[2] = ksig->sig; + regs->gprs[3] = (unsigned long) &frame->info; + regs->gprs[4] = (unsigned long) &frame->uc; + regs->gprs[5] = current->thread.last_break; + return 0; +} + +static void handle_signal(struct ksignal *ksig, sigset_t *oldset, + struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + ret = setup_rt_frame(ksig, oldset, regs); + else + ret = setup_frame(ksig->sig, &ksig->ka, oldset, regs); + + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + * + * Note that we go through the signals twice: once to check the signals that + * the kernel can handle, and then we build all the user-level signal handling + * stack-frames in one go after that. + */ + +void arch_do_signal_or_restart(struct pt_regs *regs) +{ + struct ksignal ksig; + sigset_t *oldset = sigmask_to_save(); + + /* + * Get signal to deliver. When running under ptrace, at this point + * the debugger may change all our registers, including the system + * call information. + */ + current->thread.system_call = + test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; + + if (get_signal(&ksig)) { + /* Whee! Actually deliver the signal. */ + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; + /* Check for system call restarting. */ + switch (regs->gprs[2]) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->gprs[2] = -EINTR; + break; + case -ERESTARTSYS: + if (!(ksig.ka.sa.sa_flags & SA_RESTART)) { + regs->gprs[2] = -EINTR; + break; + } + fallthrough; + case -ERESTARTNOINTR: + regs->gprs[2] = regs->orig_gpr2; + regs->psw.addr = + __rewind_psw(regs->psw, + regs->int_code >> 16); + break; + } + } + /* No longer in a system call */ + clear_pt_regs_flag(regs, PIF_SYSCALL); + + rseq_signal_deliver(&ksig, regs); + if (is_compat_task()) + handle_signal32(&ksig, oldset, regs); + else + handle_signal(&ksig, oldset, regs); + return; + } + + /* No handlers present - check for system call restart */ + clear_pt_regs_flag(regs, PIF_SYSCALL); + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; + switch (regs->gprs[2]) { + case -ERESTART_RESTARTBLOCK: + /* Restart with sys_restart_syscall */ + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->gprs[2] = regs->orig_gpr2; + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask back. + */ + restore_saved_sigmask(); +} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c new file mode 100644 index 0000000000..a4edb7ea66 --- /dev/null +++ b/arch/s390/kernel/smp.c @@ -0,0 +1,1317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SMP related functions + * + * Copyright IBM Corp. 1999, 2012 + * Author(s): Denis Joseph Barrow, + * Martin Schwidefsky , + * + * based on other smp stuff by + * (c) 1995 Alan Cox, CymruNET Ltd + * (c) 1998 Ingo Molnar + * + * The code outside of smp.c uses logical cpu numbers, only smp.c does + * the translation of logical to physical cpu ids. All new code that + * operates on physical cpu numbers needs to go into smp.c. + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +enum { + ec_schedule = 0, + ec_call_function_single, + ec_stop_cpu, + ec_mcck_pending, + ec_irq_work, +}; + +enum { + CPU_STATE_STANDBY, + CPU_STATE_CONFIGURED, +}; + +static DEFINE_PER_CPU(struct cpu *, cpu_device); + +struct pcpu { + unsigned long ec_mask; /* bit mask for ec_xxx functions */ + unsigned long ec_clk; /* sigp timestamp for ec_xxx */ + signed char state; /* physical cpu state */ + signed char polarization; /* physical polarization */ + u16 address; /* physical cpu address */ +}; + +static u8 boot_core_type; +static struct pcpu pcpu_devices[NR_CPUS]; + +unsigned int smp_cpu_mt_shift; +EXPORT_SYMBOL(smp_cpu_mt_shift); + +unsigned int smp_cpu_mtid; +EXPORT_SYMBOL(smp_cpu_mtid); + +#ifdef CONFIG_CRASH_DUMP +__vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; +#endif + +static unsigned int smp_max_threads __initdata = -1U; +cpumask_t cpu_setup_mask; + +static int __init early_nosmt(char *s) +{ + smp_max_threads = 1; + return 0; +} +early_param("nosmt", early_nosmt); + +static int __init early_smt(char *s) +{ + get_option(&s, &smp_max_threads); + return 0; +} +early_param("smt", early_smt); + +/* + * The smp_cpu_state_mutex must be held when changing the state or polarization + * member of a pcpu data structure within the pcpu_devices array. + */ +DEFINE_MUTEX(smp_cpu_state_mutex); + +/* + * Signal processor helper functions. + */ +static inline int __pcpu_sigp_relax(u16 addr, u8 order, unsigned long parm) +{ + int cc; + + while (1) { + cc = __pcpu_sigp(addr, order, parm, NULL); + if (cc != SIGP_CC_BUSY) + return cc; + cpu_relax(); + } +} + +static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm) +{ + int cc, retry; + + for (retry = 0; ; retry++) { + cc = __pcpu_sigp(pcpu->address, order, parm, NULL); + if (cc != SIGP_CC_BUSY) + break; + if (retry >= 3) + udelay(10); + } + return cc; +} + +static inline int pcpu_stopped(struct pcpu *pcpu) +{ + u32 status; + + if (__pcpu_sigp(pcpu->address, SIGP_SENSE, + 0, &status) != SIGP_CC_STATUS_STORED) + return 0; + return !!(status & (SIGP_STATUS_CHECK_STOP|SIGP_STATUS_STOPPED)); +} + +static inline int pcpu_running(struct pcpu *pcpu) +{ + if (__pcpu_sigp(pcpu->address, SIGP_SENSE_RUNNING, + 0, NULL) != SIGP_CC_STATUS_STORED) + return 1; + /* Status stored condition code is equivalent to cpu not running. */ + return 0; +} + +/* + * Find struct pcpu by cpu address. + */ +static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) +{ + int cpu; + + for_each_cpu(cpu, mask) + if (pcpu_devices[cpu].address == address) + return pcpu_devices + cpu; + return NULL; +} + +static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) +{ + int order; + + if (test_and_set_bit(ec_bit, &pcpu->ec_mask)) + return; + order = pcpu_running(pcpu) ? SIGP_EXTERNAL_CALL : SIGP_EMERGENCY_SIGNAL; + pcpu->ec_clk = get_tod_clock_fast(); + pcpu_sigp_retry(pcpu, order, 0); +} + +static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) +{ + unsigned long async_stack, nodat_stack, mcck_stack; + struct lowcore *lc; + + lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); + nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); + async_stack = stack_alloc(); + mcck_stack = stack_alloc(); + if (!lc || !nodat_stack || !async_stack || !mcck_stack) + goto out; + memcpy(lc, &S390_lowcore, 512); + memset((char *) lc + 512, 0, sizeof(*lc) - 512); + lc->async_stack = async_stack + STACK_INIT_OFFSET; + lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; + lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; + lc->cpu_nr = cpu; + lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; + if (nmi_alloc_mcesa(&lc->mcesad)) + goto out; + if (abs_lowcore_map(cpu, lc, true)) + goto out_mcesa; + lowcore_ptr[cpu] = lc; + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); + return 0; + +out_mcesa: + nmi_free_mcesa(&lc->mcesad); +out: + stack_free(mcck_stack); + stack_free(async_stack); + free_pages(nodat_stack, THREAD_SIZE_ORDER); + free_pages((unsigned long) lc, LC_ORDER); + return -ENOMEM; +} + +static void pcpu_free_lowcore(struct pcpu *pcpu) +{ + unsigned long async_stack, nodat_stack, mcck_stack; + struct lowcore *lc; + int cpu; + + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; + async_stack = lc->async_stack - STACK_INIT_OFFSET; + mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); + lowcore_ptr[cpu] = NULL; + abs_lowcore_unmap(cpu); + nmi_free_mcesa(&lc->mcesad); + stack_free(async_stack); + stack_free(mcck_stack); + free_pages(nodat_stack, THREAD_SIZE_ORDER); + free_pages((unsigned long) lc, LC_ORDER); +} + +static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) +{ + struct lowcore *lc, *abs_lc; + + lc = lowcore_ptr[cpu]; + cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); + lc->cpu_nr = cpu; + lc->restart_flags = RESTART_FLAG_CTLREGS; + lc->spinlock_lockval = arch_spin_lockval(cpu); + lc->spinlock_index = 0; + lc->percpu_offset = __per_cpu_offset[cpu]; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = s390_invalid_asce; + lc->machine_flags = S390_lowcore.machine_flags; + lc->user_timer = lc->system_timer = + lc->steal_timer = lc->avg_steal_timer = 0; + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); + lc->cregs_save_area[1] = lc->kernel_asce; + lc->cregs_save_area[7] = lc->user_asce; + save_access_regs((unsigned int *) lc->access_regs_save_area); + arch_spin_lock_setup(cpu); +} + +static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +{ + struct lowcore *lc; + int cpu; + + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; + lc->current_task = (unsigned long)tsk; + lc->lpp = LPP_MAGIC; + lc->current_pid = tsk->pid; + lc->user_timer = tsk->thread.user_timer; + lc->guest_timer = tsk->thread.guest_timer; + lc->system_timer = tsk->thread.system_timer; + lc->hardirq_timer = tsk->thread.hardirq_timer; + lc->softirq_timer = tsk->thread.softirq_timer; + lc->steal_timer = 0; +} + +static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +{ + struct lowcore *lc; + int cpu; + + cpu = pcpu - pcpu_devices; + lc = lowcore_ptr[cpu]; + lc->restart_stack = lc->kernel_stack; + lc->restart_fn = (unsigned long) func; + lc->restart_data = (unsigned long) data; + lc->restart_source = -1U; + pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); +} + +typedef void (pcpu_delegate_fn)(void *); + +/* + * Call function via PSW restart on pcpu and stop the current cpu. + */ +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) +{ + func(data); /* should not return */ +} + +static void pcpu_delegate(struct pcpu *pcpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) +{ + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; + + lc = lowcore_ptr[pcpu - pcpu_devices]; + source_cpu = stap(); + + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } + /* Stop target cpu (if func returns this stops the current cpu). */ + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0); + /* Restart func on the target cpu and stop the current cpu. */ + if (lc) { + lc->restart_stack = stack; + lc->restart_fn = (unsigned long)func; + lc->restart_data = (unsigned long)data; + lc->restart_source = source_cpu; + } else { + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc); + } + asm volatile( + "0: sigp 0,%0,%2 # sigp restart to target cpu\n" + " brc 2,0b # busy, try again\n" + "1: sigp 0,%1,%3 # sigp stop to current cpu\n" + " brc 2,1b # busy, try again\n" + : : "d" (pcpu->address), "d" (source_cpu), + "K" (SIGP_RESTART), "K" (SIGP_STOP) + : "0", "1", "cc"); + for (;;) ; +} + +/* + * Enable additional logical cpus for multi-threading. + */ +static int pcpu_set_smt(unsigned int mtid) +{ + int cc; + + if (smp_cpu_mtid == mtid) + return 0; + cc = __pcpu_sigp(0, SIGP_SET_MULTI_THREADING, mtid, NULL); + if (cc == 0) { + smp_cpu_mtid = mtid; + smp_cpu_mt_shift = 0; + while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) + smp_cpu_mt_shift++; + pcpu_devices[0].address = stap(); + } + return cc; +} + +/* + * Call function on an online CPU. + */ +void smp_call_online_cpu(void (*func)(void *), void *data) +{ + struct pcpu *pcpu; + + /* Use the current cpu if it is online. */ + pcpu = pcpu_find_address(cpu_online_mask, stap()); + if (!pcpu) + /* Use the first online cpu. */ + pcpu = pcpu_devices + cpumask_first(cpu_online_mask); + pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); +} + +/* + * Call function on the ipl CPU. + */ +void smp_call_ipl_cpu(void (*func)(void *), void *data) +{ + struct lowcore *lc = lowcore_ptr[0]; + + if (pcpu_devices[0].address == stap()) + lc = &S390_lowcore; + + pcpu_delegate(&pcpu_devices[0], func, data, + lc->nodat_stack); +} + +int smp_find_processor_id(u16 address) +{ + int cpu; + + for_each_present_cpu(cpu) + if (pcpu_devices[cpu].address == address) + return cpu; + return -1; +} + +void schedule_mcck_handler(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); +} + +bool notrace arch_vcpu_is_preempted(int cpu) +{ + if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) + return false; + if (pcpu_running(pcpu_devices + cpu)) + return false; + return true; +} +EXPORT_SYMBOL(arch_vcpu_is_preempted); + +void notrace smp_yield_cpu(int cpu) +{ + if (!MACHINE_HAS_DIAG9C) + return; + diag_stat_inc_norecursion(DIAG_STAT_X09C); + asm volatile("diag %0,0,0x9c" + : : "d" (pcpu_devices[cpu].address)); +} +EXPORT_SYMBOL_GPL(smp_yield_cpu); + +/* + * Send cpus emergency shutdown signal. This gives the cpus the + * opportunity to complete outstanding interrupts. + */ +void notrace smp_emergency_stop(void) +{ + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + static cpumask_t cpumask; + u64 end; + int cpu; + + arch_spin_lock(&lock); + cpumask_copy(&cpumask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpumask); + + end = get_tod_clock() + (1000000UL << 12); + for_each_cpu(cpu, &cpumask) { + struct pcpu *pcpu = pcpu_devices + cpu; + set_bit(ec_stop_cpu, &pcpu->ec_mask); + while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, + 0, NULL) == SIGP_CC_BUSY && + get_tod_clock() < end) + cpu_relax(); + } + while (get_tod_clock() < end) { + for_each_cpu(cpu, &cpumask) + if (pcpu_stopped(pcpu_devices + cpu)) + cpumask_clear_cpu(cpu, &cpumask); + if (cpumask_empty(&cpumask)) + break; + cpu_relax(); + } + arch_spin_unlock(&lock); +} +NOKPROBE_SYMBOL(smp_emergency_stop); + +/* + * Stop all cpus but the current one. + */ +void smp_send_stop(void) +{ + int cpu; + + /* Disable all interrupts/machine checks */ + __load_psw_mask(PSW_KERNEL_BITS); + trace_hardirqs_off(); + + debug_set_critical(); + + if (oops_in_progress) + smp_emergency_stop(); + + /* stop all processors */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu_devices + cpu)) + cpu_relax(); + } +} + +/* + * This is the main routine where commands issued by other + * cpus are handled. + */ +static void smp_handle_ext_call(void) +{ + unsigned long bits; + + /* handle bit signal external calls */ + bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + if (test_bit(ec_stop_cpu, &bits)) + smp_stop_cpu(); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function_single, &bits)) + generic_smp_call_function_single_interrupt(); + if (test_bit(ec_mcck_pending, &bits)) + s390_handle_mcck(); + if (test_bit(ec_irq_work, &bits)) + irq_work_run(); +} + +static void do_ext_call_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + inc_irq_stat(ext_code.code == 0x1202 ? IRQEXT_EXC : IRQEXT_EMS); + smp_handle_ext_call(); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + int cpu; + + for_each_cpu(cpu, mask) + pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); +} + +void arch_send_call_function_single_ipi(int cpu) +{ + pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); +} + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void arch_smp_send_reschedule(int cpu) +{ + pcpu_ec_call(pcpu_devices + cpu, ec_schedule); +} + +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); +} +#endif + +/* + * parameter area for the set/clear control bit callbacks + */ +struct ec_creg_mask_parms { + unsigned long orval; + unsigned long andval; + int cr; +}; + +/* + * callback for setting/clearing control bits + */ +static void smp_ctl_bit_callback(void *info) +{ + struct ec_creg_mask_parms *pp = info; + unsigned long cregs[16]; + + __ctl_store(cregs, 0, 15); + cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; + __ctl_load(cregs, 0, 15); +} + +static DEFINE_SPINLOCK(ctl_lock); + +void smp_ctl_set_clear_bit(int cr, int bit, bool set) +{ + struct ec_creg_mask_parms parms = { .cr = cr, }; + struct lowcore *abs_lc; + u64 ctlreg; + + if (set) { + parms.orval = 1UL << bit; + parms.andval = -1UL; + } else { + parms.orval = 0; + parms.andval = ~(1UL << bit); + } + spin_lock(&ctl_lock); + abs_lc = get_abs_lowcore(); + ctlreg = abs_lc->cregs_save_area[cr]; + ctlreg = (ctlreg & parms.andval) | parms.orval; + abs_lc->cregs_save_area[cr] = ctlreg; + put_abs_lowcore(abs_lc); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); + spin_unlock(&ctl_lock); +} +EXPORT_SYMBOL(smp_ctl_set_clear_bit); + +#ifdef CONFIG_CRASH_DUMP + +int smp_store_status(int cpu) +{ + struct lowcore *lc; + struct pcpu *pcpu; + unsigned long pa; + + pcpu = pcpu_devices + cpu; + lc = lowcore_ptr[cpu]; + pa = __pa(&lc->floating_pt_save_area); + if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, + pa) != SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + return 0; + pa = lc->mcesad & MCESA_ORIGIN_MASK; + if (MACHINE_HAS_GS) + pa |= lc->mcesad & MCESA_LC_MASK; + if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, + pa) != SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + return 0; +} + +/* + * Collect CPU state of the previous, crashed system. + * There are four cases: + * 1) standard zfcp/nvme dump + * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The boot CPU state is located in + * the absolute lowcore of the memory stored in the HSA. The zcore code + * will copy the boot CPU state from the HSA. + * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory) + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The firmware or the boot-loader + * stored the registers of the boot CPU in the absolute lowcore in the + * memory of the old system. + * 3) kdump and the old kernel did not store the CPU state, + * or stand-alone kdump for DASD + * condition: OLDMEM_BASE != NULL && !is_kdump_kernel() + * The state for all CPUs except the boot CPU needs to be collected + * with sigp stop-and-store-status. The kexec code or the boot-loader + * stored the registers of the boot CPU in the memory of the old system. + * 4) kdump and the old kernel stored the CPU state + * condition: OLDMEM_BASE != NULL && is_kdump_kernel() + * This case does not exist for s390 anymore, setup_arch explicitly + * deactivates the elfcorehdr= kernel parameter + */ +static bool dump_available(void) +{ + return oldmem_data.start || is_ipl_type_dump(); +} + +void __init smp_save_dump_ipl_cpu(void) +{ + struct save_area *sa; + void *regs; + + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc(512, 8); + if (!sa || !regs) + panic("could not allocate memory for boot CPU save area\n"); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); + save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (MACHINE_HAS_VX) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); +} + +void __init smp_save_dump_secondary_cpus(void) +{ + int addr, boot_cpu_addr, max_cpu_addr; + struct save_area *sa; + void *page; + + if (!dump_available()) + return; + /* Allocate a page as dumping area for the store status sigps */ + page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!page) + panic("ERROR: Failed to allocate %lx bytes below %lx\n", + PAGE_SIZE, 1UL << 31); + + /* Set multi-threading state to the previous system. */ + pcpu_set_smt(sclp.mtid_prev); + boot_cpu_addr = stap(); + max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; + for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; + if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == + SIGP_CC_NOT_OPERATIONAL) + continue; + sa = save_area_alloc(false); + if (!sa) + panic("could not allocate memory for save area\n"); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (MACHINE_HAS_VX) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } + } + memblock_free(page, PAGE_SIZE); + diag_amode31_ops.diag308_reset(); + pcpu_set_smt(0); +} +#endif /* CONFIG_CRASH_DUMP */ + +void smp_cpu_set_polarization(int cpu, int val) +{ + pcpu_devices[cpu].polarization = val; +} + +int smp_cpu_get_polarization(int cpu) +{ + return pcpu_devices[cpu].polarization; +} + +int smp_cpu_get_cpu_address(int cpu) +{ + return pcpu_devices[cpu].address; +} + +static void __ref smp_get_core_info(struct sclp_core_info *info, int early) +{ + static int use_sigp_detection; + int address; + + if (use_sigp_detection || sclp_get_core_info(info, early)) { + use_sigp_detection = 1; + for (address = 0; + address < (SCLP_MAX_CORES << smp_cpu_mt_shift); + address += (1U << smp_cpu_mt_shift)) { + if (__pcpu_sigp_relax(address, SIGP_SENSE, 0) == + SIGP_CC_NOT_OPERATIONAL) + continue; + info->core[info->configured].core_id = + address >> smp_cpu_mt_shift; + info->configured++; + } + info->combined = info->configured; + } +} + +static int smp_add_present_cpu(int cpu); + +static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, + bool configured, bool early) +{ + struct pcpu *pcpu; + int cpu, nr, i; + u16 address; + + nr = 0; + if (sclp.has_core_type && core->type != boot_core_type) + return nr; + cpu = cpumask_first(avail); + address = core->core_id << smp_cpu_mt_shift; + for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { + if (pcpu_find_address(cpu_present_mask, address + i)) + continue; + pcpu = pcpu_devices + cpu; + pcpu->address = address + i; + if (configured) + pcpu->state = CPU_STATE_CONFIGURED; + else + pcpu->state = CPU_STATE_STANDBY; + smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + set_cpu_present(cpu, true); + if (!early && smp_add_present_cpu(cpu) != 0) + set_cpu_present(cpu, false); + else + nr++; + cpumask_clear_cpu(cpu, avail); + cpu = cpumask_next(cpu, avail); + } + return nr; +} + +static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) +{ + struct sclp_core_entry *core; + static cpumask_t avail; + bool configured; + u16 core_id; + int nr, i; + + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + nr = 0; + cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); + /* + * Add IPL core first (which got logical CPU number 0) to make sure + * that all SMT threads get subsequent logical CPU numbers. + */ + if (early) { + core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + for (i = 0; i < info->configured; i++) { + core = &info->core[i]; + if (core->core_id == core_id) { + nr += smp_add_core(core, &avail, true, early); + break; + } + } + } + for (i = 0; i < info->combined; i++) { + configured = i < info->configured; + nr += smp_add_core(&info->core[i], &avail, configured, early); + } + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return nr; +} + +void __init smp_detect_cpus(void) +{ + unsigned int cpu, mtid, c_cpus, s_cpus; + struct sclp_core_info *info; + u16 address; + + /* Get CPU information */ + info = memblock_alloc(sizeof(*info), 8); + if (!info) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(*info), 8); + smp_get_core_info(info, 1); + /* Find boot CPU type */ + if (sclp.has_core_type) { + address = stap(); + for (cpu = 0; cpu < info->combined; cpu++) + if (info->core[cpu].core_id == address) { + /* The boot cpu dictates the cpu type. */ + boot_core_type = info->core[cpu].type; + break; + } + if (cpu >= info->combined) + panic("Could not find boot CPU type"); + } + + /* Set multi-threading state for the current system */ + mtid = boot_core_type ? sclp.mtid : sclp.mtid_cp; + mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1; + pcpu_set_smt(mtid); + + /* Print number of CPUs */ + c_cpus = s_cpus = 0; + for (cpu = 0; cpu < info->combined; cpu++) { + if (sclp.has_core_type && + info->core[cpu].type != boot_core_type) + continue; + if (cpu < info->configured) + c_cpus += smp_cpu_mtid + 1; + else + s_cpus += smp_cpu_mtid + 1; + } + pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); + + /* Add CPUs present at boot */ + __smp_rescan_cpus(info, true); + memblock_free(info, sizeof(*info)); +} + +/* + * Activate a secondary processor. + */ +static void smp_start_secondary(void *cpuvoid) +{ + int cpu = raw_smp_processor_id(); + + S390_lowcore.last_update_clock = get_tod_clock(); + S390_lowcore.restart_stack = (unsigned long)restart_stack; + S390_lowcore.restart_fn = (unsigned long)do_restart; + S390_lowcore.restart_data = 0; + S390_lowcore.restart_source = -1U; + S390_lowcore.restart_flags = 0; + restore_access_regs(S390_lowcore.access_regs_save_area); + cpu_init(); + rcu_cpu_starting(cpu); + init_cpu_timer(); + vtime_init(); + vdso_getcpu_init(); + pfault_init(); + cpumask_set_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); + notify_cpu_starting(cpu); + if (topology_cpu_dedicated(cpu)) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); + set_cpu_online(cpu, true); + inc_irq_stat(CPU_RST); + local_irq_enable(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +/* Upping and downing of CPUs */ +int __cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + struct pcpu *pcpu = pcpu_devices + cpu; + int rc; + + if (pcpu->state != CPU_STATE_CONFIGURED) + return -EIO; + if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) != + SIGP_CC_ORDER_CODE_ACCEPTED) + return -EIO; + + rc = pcpu_alloc_lowcore(pcpu, cpu); + if (rc) + return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. + */ + spin_lock(&ctl_lock); + pcpu_prepare_secondary(pcpu, cpu); + pcpu_attach_task(pcpu, tidle); + pcpu_start_fn(pcpu, smp_start_secondary, NULL); + /* Wait until cpu puts itself in the online & active maps */ + while (!cpu_online(cpu)) + cpu_relax(); + spin_unlock(&ctl_lock); + return 0; +} + +static unsigned int setup_possible_cpus __initdata; + +static int __init _setup_possible_cpus(char *s) +{ + get_option(&s, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + +int __cpu_disable(void) +{ + unsigned long cregs[16]; + int cpu; + + /* Handle possible pending IPIs */ + smp_handle_ext_call(); + cpu = smp_processor_id(); + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); + /* Disable pseudo page faults on this cpu. */ + pfault_fini(); + /* Disable interrupt sources via control register. */ + __ctl_store(cregs, 0, 15); + cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ + __ctl_load(cregs, 0, 15); + clear_cpu_flag(CIF_NOHZ_DELAY); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + struct pcpu *pcpu; + + /* Wait until target cpu is down */ + pcpu = pcpu_devices + cpu; + while (!pcpu_stopped(pcpu)) + cpu_relax(); + pcpu_free_lowcore(pcpu); + cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); + cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); +} + +void __noreturn cpu_die(void) +{ + idle_task_exit(); + pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + for (;;) ; +} + +void __init smp_fill_possible_mask(void) +{ + unsigned int possible, sclp_max, cpu; + + sclp_max = max(sclp.mtid, sclp.mtid_cp) + 1; + sclp_max = min(smp_max_threads, sclp_max); + sclp_max = (sclp.max_cores * sclp_max) ?: nr_cpu_ids; + possible = setup_possible_cpus ?: nr_cpu_ids; + possible = min(possible, sclp_max); + for (cpu = 0; cpu < possible && cpu < nr_cpu_ids; cpu++) + set_cpu_possible(cpu, true); +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + /* request the 0x1201 emergency signal external interrupt */ + if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) + panic("Couldn't request external interrupt 0x1201"); + /* request the 0x1202 external call external interrupt */ + if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) + panic("Couldn't request external interrupt 0x1202"); +} + +void __init smp_prepare_boot_cpu(void) +{ + struct pcpu *pcpu = pcpu_devices; + + WARN_ON(!cpu_present(0) || !cpu_online(0)); + pcpu->state = CPU_STATE_CONFIGURED; + S390_lowcore.percpu_offset = __per_cpu_offset[0]; + smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); +} + +void __init smp_setup_processor_id(void) +{ + pcpu_devices[0].address = stap(); + S390_lowcore.cpu_nr = 0; + S390_lowcore.spinlock_lockval = arch_spin_lockval(0); + S390_lowcore.spinlock_index = 0; +} + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. + * + * usually you want to run this on all CPUs ;) + */ +int setup_profiling_timer(unsigned int multiplier) +{ + return 0; +} + +static ssize_t cpu_configure_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} + +static ssize_t cpu_configure_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu; + int cpu, val, rc, i; + char delim; + + if (sscanf(buf, "%d %c", &val, &delim) != 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + rc = -EBUSY; + /* disallow configuration changes of online cpus and cpu 0 */ + cpu = dev->id; + cpu = smp_get_base_cpu(cpu); + if (cpu == 0) + goto out; + for (i = 0; i <= smp_cpu_mtid; i++) + if (cpu_online(cpu + i)) + goto out; + pcpu = pcpu_devices + cpu; + rc = 0; + switch (val) { + case 0: + if (pcpu->state != CPU_STATE_CONFIGURED) + break; + rc = sclp_core_deconfigure(pcpu->address >> smp_cpu_mt_shift); + if (rc) + break; + for (i = 0; i <= smp_cpu_mtid; i++) { + if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) + continue; + pcpu[i].state = CPU_STATE_STANDBY; + smp_cpu_set_polarization(cpu + i, + POLARIZATION_UNKNOWN); + } + topology_expect_change(); + break; + case 1: + if (pcpu->state != CPU_STATE_STANDBY) + break; + rc = sclp_core_configure(pcpu->address >> smp_cpu_mt_shift); + if (rc) + break; + for (i = 0; i <= smp_cpu_mtid; i++) { + if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) + continue; + pcpu[i].state = CPU_STATE_CONFIGURED; + smp_cpu_set_polarization(cpu + i, + POLARIZATION_UNKNOWN); + } + topology_expect_change(); + break; + default: + break; + } +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc ? rc : count; +} +static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); + +static ssize_t show_cpu_address(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); +} +static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); + +static struct attribute *cpu_common_attrs[] = { + &dev_attr_configure.attr, + &dev_attr_address.attr, + NULL, +}; + +static struct attribute_group cpu_common_attr_group = { + .attrs = cpu_common_attrs, +}; + +static struct attribute *cpu_online_attrs[] = { + &dev_attr_idle_count.attr, + &dev_attr_idle_time_us.attr, + NULL, +}; + +static struct attribute_group cpu_online_attr_group = { + .attrs = cpu_online_attrs, +}; + +static int smp_cpu_online(unsigned int cpu) +{ + struct device *s = &per_cpu(cpu_device, cpu)->dev; + + return sysfs_create_group(&s->kobj, &cpu_online_attr_group); +} + +static int smp_cpu_pre_down(unsigned int cpu) +{ + struct device *s = &per_cpu(cpu_device, cpu)->dev; + + sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + return 0; +} + +static int smp_add_present_cpu(int cpu) +{ + struct device *s; + struct cpu *c; + int rc; + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + per_cpu(cpu_device, cpu) = c; + s = &c->dev; + c->hotpluggable = 1; + rc = register_cpu(c, cpu); + if (rc) + goto out; + rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + if (rc) + goto out_cpu; + rc = topology_cpu_init(c); + if (rc) + goto out_topology; + return 0; + +out_topology: + sysfs_remove_group(&s->kobj, &cpu_common_attr_group); +out_cpu: + unregister_cpu(c); +out: + return rc; +} + +int __ref smp_rescan_cpus(void) +{ + struct sclp_core_info *info; + int nr; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + smp_get_core_info(info, 0); + nr = __smp_rescan_cpus(info, false); + kfree(info); + if (nr) + topology_schedule_update(); + return 0; +} + +static ssize_t __ref rescan_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int rc; + + rc = lock_device_hotplug_sysfs(); + if (rc) + return rc; + rc = smp_rescan_cpus(); + unlock_device_hotplug(); + return rc ? rc : count; +} +static DEVICE_ATTR_WO(rescan); + +static int __init s390_smp_init(void) +{ + struct device *dev_root; + int cpu, rc = 0; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_rescan); + put_device(dev_root); + if (rc) + return rc; + } + + for_each_present_cpu(cpu) { + rc = smp_add_present_cpu(cpu); + if (rc) + goto out; + } + + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", + smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? rc : 0; +out: + return rc; +} +subsys_initcall(s390_smp_init); + +static __always_inline void set_new_lowcore(struct lowcore *lc) +{ + union register_pair dst, src; + u32 pfx; + + src.even = (unsigned long) &S390_lowcore; + src.odd = sizeof(S390_lowcore); + dst.even = (unsigned long) lc; + dst.odd = sizeof(*lc); + pfx = __pa(lc); + + asm volatile( + " mvcl %[dst],%[src]\n" + " spx %[pfx]\n" + : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) + : [pfx] "Q" (pfx) + : "memory", "cc"); +} + +int __init smp_reinit_ipl_cpu(void) +{ + unsigned long async_stack, nodat_stack, mcck_stack; + struct lowcore *lc, *lc_ipl; + unsigned long flags, cr0; + u64 mcesad; + + lc_ipl = lowcore_ptr[0]; + lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); + nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); + async_stack = stack_alloc(); + mcck_stack = stack_alloc(); + if (!lc || !nodat_stack || !async_stack || !mcck_stack || nmi_alloc_mcesa(&mcesad)) + panic("Couldn't allocate memory"); + + local_irq_save(flags); + local_mcck_disable(); + set_new_lowcore(lc); + S390_lowcore.nodat_stack = nodat_stack + STACK_INIT_OFFSET; + S390_lowcore.async_stack = async_stack + STACK_INIT_OFFSET; + S390_lowcore.mcck_stack = mcck_stack + STACK_INIT_OFFSET; + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + S390_lowcore.mcesad = mcesad; + __ctl_load(cr0, 0, 0); + if (abs_lowcore_map(0, lc, false)) + panic("Couldn't remap absolute lowcore"); + lowcore_ptr[0] = lc; + local_mcck_enable(); + local_irq_restore(flags); + + memblock_free_late(__pa(lc_ipl->mcck_stack - STACK_INIT_OFFSET), THREAD_SIZE); + memblock_free_late(__pa(lc_ipl->async_stack - STACK_INIT_OFFSET), THREAD_SIZE); + memblock_free_late(__pa(lc_ipl->nodat_stack - STACK_INIT_OFFSET), THREAD_SIZE); + memblock_free_late(__pa(lc_ipl), sizeof(*lc_ipl)); + return 0; +} diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c new file mode 100644 index 0000000000..0787010139 --- /dev/null +++ b/arch/s390/kernel/stacktrace.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stack trace management functions + * + * Copyright IBM Corp. 2006 + */ + +#include +#include +#include +#include + +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) +{ + struct unwind_state state; + unsigned long addr; + + unwind_for_each_frame(&state, task, regs, 0) { + addr = unwind_get_return_address(&state); + if (!addr || !consume_entry(cookie, addr)) + break; + } +} + +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct unwind_state state; + unsigned long addr; + + unwind_for_each_frame(&state, task, NULL, 0) { + if (state.stack_info.type != STACK_TYPE_TASK) + return -EINVAL; + + if (state.regs) + return -EINVAL; + + addr = unwind_get_return_address(&state); + if (!addr) + return -EINVAL; + +#ifdef CONFIG_RETHOOK + /* + * Mark stacktraces with krethook functions on them + * as unreliable. + */ + if (state.ip == (unsigned long)arch_rethook_trampoline) + return -EINVAL; +#endif + + if (!consume_entry(cookie, addr)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) + return -EINVAL; + return 0; +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c new file mode 100644 index 0000000000..30bb20461d --- /dev/null +++ b/arch/s390/kernel/sthyi.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * store hypervisor information instruction emulation functions. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +/* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * cache the retrieved data whose valid period is 1s. + */ +#define CACHE_VALID_JIFFIES HZ + +struct sthyi_info { + void *info; + unsigned long end; +}; + +static DEFINE_MUTEX(sthyi_mutex); +static struct sthyi_info sthyi_cache; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + if (*(u64 *)sctns->mac.infmname != 0) + sctns->mac.infmval1 |= MAC_NAME_VLD; + + if (stsi(sysinfo, 1, 1, 1)) + return; + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + free_page((unsigned long)diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr, u64 *rc) +{ + union register_pair r1 = { .even = 0, }; /* subcode */ + union register_pair r2 = { .even = vaddr, }; + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[r1],%[r2]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + : [r1] "d" (r1.pair) + : "memory", "cc"); + *rc = r2.odd; + return cc; +} + +static int fill_dst(void *dst, u64 *rc) +{ + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + + /* + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. + */ + if (test_facility(74)) + return sthyi((u64)dst, rc); + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + *rc = 0; + return 0; +} + +static int sthyi_init_cache(void) +{ + if (sthyi_cache.info) + return 0; + sthyi_cache.info = (void *)get_zeroed_page(GFP_KERNEL); + if (!sthyi_cache.info) + return -ENOMEM; + sthyi_cache.end = jiffies - 1; /* expired */ + return 0; +} + +static int sthyi_update_cache(u64 *rc) +{ + int r; + + memset(sthyi_cache.info, 0, PAGE_SIZE); + r = fill_dst(sthyi_cache.info, rc); + if (r) + return r; + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + return r; +} + +/* + * sthyi_fill - Fill page with data returned by the STHYI instruction + * + * @dst: Pointer to zeroed page + * @rc: Pointer for storing the return code of the instruction + * + * Fills the destination with system information returned by the STHYI + * instruction. The data is generated by emulation or execution of STHYI, + * if available. The return value is either a negative error value or + * the condition code that would be returned, the rc parameter is the + * return code which is passed in register R2 + 1. + */ +int sthyi_fill(void *dst, u64 *rc) +{ + int r; + + mutex_lock(&sthyi_mutex); + r = sthyi_init_cache(); + if (r) + goto out; + + if (time_is_before_jiffies(sthyi_cache.end)) { + /* cache expired */ + r = sthyi_update_cache(rc); + if (r) + goto out; + } + *rc = 0; + memcpy(dst, sthyi_cache.info, PAGE_SIZE); +out: + mutex_unlock(&sthyi_mutex); + return r; +} +EXPORT_SYMBOL_GPL(sthyi_fill); + +SYSCALL_DEFINE4(s390_sthyi, unsigned long, function_code, void __user *, buffer, + u64 __user *, return_code, unsigned long, flags) +{ + u64 sthyi_rc; + void *info; + int r; + + if (flags) + return -EINVAL; + if (function_code != STHYI_FC_CP_IFL_CAP) + return -EOPNOTSUPP; + info = (void *)get_zeroed_page(GFP_KERNEL); + if (!info) + return -ENOMEM; + r = sthyi_fill(info, &sthyi_rc); + if (r < 0) + goto out; + if (return_code && put_user(sthyi_rc, return_code)) { + r = -EFAULT; + goto out; + } + if (copy_to_user(buffer, info, PAGE_SIZE)) + r = -EFAULT; +out: + free_page((unsigned long)info); + return r; +} diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c new file mode 100644 index 0000000000..dc2355c623 --- /dev/null +++ b/arch/s390/kernel/syscall.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Thomas Spatzier (tspat@de.ibm.com) + * + * Derived from "arch/i386/kernel/sys_i386.c" + * + * This file contains various random system calls that + * have a non-standard calling sequence on the Linux/s390 + * platform. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "entry.h" + +/* + * Perform the mmap() system call. Linux for S/390 isn't able to handle more + * than 5 system call parameters, so this system call uses a memory block + * for parameter passing. + */ + +struct s390_mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) +{ + struct s390_mmap_arg_struct a; + int error = -EFAULT; + + if (copy_from_user(&a, arg, sizeof(a))) + goto out; + error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); +out: + return error; +} + +#ifdef CONFIG_SYSVIPC +/* + * sys_ipc() is the de-multiplexer for the SysV IPC calls. + */ +SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, + unsigned long, third, void __user *, ptr) +{ + if (call >> 16) + return -EINVAL; + /* The s390 sys_ipc variant has only five parameters instead of six + * like the generic variant. The only difference is the handling of + * the SEMTIMEDOP subcall where on s390 the third parameter is used + * as a pointer to a struct timespec where the generic variant uses + * the fifth parameter. + * Therefore we can call the generic variant by simply passing the + * third parameter also as fifth parameter. + */ + return ksys_ipc(call, first, second, third, ptr, third); +} +#endif /* CONFIG_SYSVIPC */ + +SYSCALL_DEFINE1(s390_personality, unsigned int, personality) +{ + unsigned int ret = current->personality; + + if (personality(current->personality) == PER_LINUX32 && + personality(personality) == PER_LINUX) + personality |= PER_LINUX32; + + if (personality != 0xffffffff) + set_personality(personality); + + if (personality(ret) == PER_LINUX32) + ret &= ~PER_LINUX32; + + return ret; +} + +SYSCALL_DEFINE0(ni_syscall) +{ + return -ENOSYS; +} + +static void do_syscall(struct pt_regs *regs) +{ + unsigned long nr; + + nr = regs->int_code & 0xffff; + if (!nr) { + nr = regs->gprs[1] & 0xffff; + regs->int_code &= ~0xffffUL; + regs->int_code |= nr; + } + + regs->gprs[2] = nr; + + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* + * In the s390 ptrace ABI, both the syscall number and the return value + * use gpr2. However, userspace puts the syscall number either in the + * svc instruction itself, or uses gpr1. To make at least skipping syscalls + * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here + * and if set, the syscall will be skipped. + */ + + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); +out: + syscall_exit_to_user_mode_work(regs); +} + +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) +{ + add_random_kstack_offset(); + enter_from_user_mode(regs); + regs->psw = S390_lowcore.svc_old_psw; + regs->int_code = S390_lowcore.svc_int_code; + update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + + if (per_trap) + set_thread_flag(TIF_PER_TRAP); + + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); + exit_to_user_mode(); +} diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile new file mode 100644 index 0000000000..fb85e79794 --- /dev/null +++ b/arch/s390/kernel/syscalls/Makefile @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0 + +gen := arch/$(ARCH)/include/generated +kapi := $(gen)/asm +uapi := $(gen)/uapi/asm + +syscall := $(srctree)/$(src)/syscall.tbl +systbl := $(srctree)/$(src)/syscalltbl + +gen-y := $(kapi)/syscall_table.h +kapi-hdrs-y := $(kapi)/unistd_nr.h +uapi-hdrs-y := $(uapi)/unistd_32.h +uapi-hdrs-y += $(uapi)/unistd_64.h + +targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) + +PHONY += kapi uapi + +kapi: $(gen-y) $(kapi-hdrs-y) +uapi: $(uapi-hdrs-y) + + +# Create output directory if not already present +$(shell mkdir -p $(uapi) $(kapi)) + +filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< + +filechk_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< + +filechk_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< + +syshdr_abi_unistd_32 := common,32 +$(uapi)/unistd_32.h: $(syscall) FORCE + $(call filechk,syshdr,$@) + +syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: $(syscall) FORCE + $(call filechk,syshdr,$@) + +$(kapi)/syscall_table.h: $(syscall) FORCE + $(call filechk,syscalls) + +sysnr_abi_unistd_nr := common,32,64 +$(kapi)/unistd_nr.h: $(syscall) FORCE + $(call filechk,sysnr) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl new file mode 100644 index 0000000000..0122cc1569 --- /dev/null +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -0,0 +1,457 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# System call table for s390 +# +# Format: +# +# +# +# where can be common, 64, or 32 + +1 common exit sys_exit sys_exit +2 common fork sys_fork sys_fork +3 common read sys_read compat_sys_s390_read +4 common write sys_write compat_sys_s390_write +5 common open sys_open compat_sys_open +6 common close sys_close sys_close +7 common restart_syscall sys_restart_syscall sys_restart_syscall +8 common creat sys_creat sys_creat +9 common link sys_link sys_link +10 common unlink sys_unlink sys_unlink +11 common execve sys_execve compat_sys_execve +12 common chdir sys_chdir sys_chdir +13 32 time - sys_time32 +14 common mknod sys_mknod sys_mknod +15 common chmod sys_chmod sys_chmod +16 32 lchown - sys_lchown16 +19 common lseek sys_lseek compat_sys_lseek +20 common getpid sys_getpid sys_getpid +21 common mount sys_mount sys_mount +22 common umount sys_oldumount sys_oldumount +23 32 setuid - sys_setuid16 +24 32 getuid - sys_getuid16 +25 32 stime - sys_stime32 +26 common ptrace sys_ptrace compat_sys_ptrace +27 common alarm sys_alarm sys_alarm +29 common pause sys_pause sys_pause +30 common utime sys_utime sys_utime32 +33 common access sys_access sys_access +34 common nice sys_nice sys_nice +36 common sync sys_sync sys_sync +37 common kill sys_kill sys_kill +38 common rename sys_rename sys_rename +39 common mkdir sys_mkdir sys_mkdir +40 common rmdir sys_rmdir sys_rmdir +41 common dup sys_dup sys_dup +42 common pipe sys_pipe sys_pipe +43 common times sys_times compat_sys_times +45 common brk sys_brk sys_brk +46 32 setgid - sys_setgid16 +47 32 getgid - sys_getgid16 +48 common signal sys_signal sys_signal +49 32 geteuid - sys_geteuid16 +50 32 getegid - sys_getegid16 +51 common acct sys_acct sys_acct +52 common umount2 sys_umount sys_umount +54 common ioctl sys_ioctl compat_sys_ioctl +55 common fcntl sys_fcntl compat_sys_fcntl +57 common setpgid sys_setpgid sys_setpgid +60 common umask sys_umask sys_umask +61 common chroot sys_chroot sys_chroot +62 common ustat sys_ustat compat_sys_ustat +63 common dup2 sys_dup2 sys_dup2 +64 common getppid sys_getppid sys_getppid +65 common getpgrp sys_getpgrp sys_getpgrp +66 common setsid sys_setsid sys_setsid +67 common sigaction sys_sigaction compat_sys_sigaction +70 32 setreuid - sys_setreuid16 +71 32 setregid - sys_setregid16 +72 common sigsuspend sys_sigsuspend sys_sigsuspend +73 common sigpending sys_sigpending compat_sys_sigpending +74 common sethostname sys_sethostname sys_sethostname +75 common setrlimit sys_setrlimit compat_sys_setrlimit +76 32 getrlimit - compat_sys_old_getrlimit +77 common getrusage sys_getrusage compat_sys_getrusage +78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 common settimeofday sys_settimeofday compat_sys_settimeofday +80 32 getgroups - sys_getgroups16 +81 32 setgroups - sys_setgroups16 +83 common symlink sys_symlink sys_symlink +85 common readlink sys_readlink sys_readlink +86 common uselib sys_uselib sys_uselib +87 common swapon sys_swapon sys_swapon +88 common reboot sys_reboot sys_reboot +89 common readdir - compat_sys_old_readdir +90 common mmap sys_old_mmap compat_sys_s390_old_mmap +91 common munmap sys_munmap sys_munmap +92 common truncate sys_truncate compat_sys_truncate +93 common ftruncate sys_ftruncate compat_sys_ftruncate +94 common fchmod sys_fchmod sys_fchmod +95 32 fchown - sys_fchown16 +96 common getpriority sys_getpriority sys_getpriority +97 common setpriority sys_setpriority sys_setpriority +99 common statfs sys_statfs compat_sys_statfs +100 common fstatfs sys_fstatfs compat_sys_fstatfs +101 32 ioperm - - +102 common socketcall sys_socketcall compat_sys_socketcall +103 common syslog sys_syslog sys_syslog +104 common setitimer sys_setitimer compat_sys_setitimer +105 common getitimer sys_getitimer compat_sys_getitimer +106 common stat sys_newstat compat_sys_newstat +107 common lstat sys_newlstat compat_sys_newlstat +108 common fstat sys_newfstat compat_sys_newfstat +110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +111 common vhangup sys_vhangup sys_vhangup +112 common idle - - +114 common wait4 sys_wait4 compat_sys_wait4 +115 common swapoff sys_swapoff sys_swapoff +116 common sysinfo sys_sysinfo compat_sys_sysinfo +117 common ipc sys_s390_ipc compat_sys_s390_ipc +118 common fsync sys_fsync sys_fsync +119 common sigreturn sys_sigreturn compat_sys_sigreturn +120 common clone sys_clone sys_clone +121 common setdomainname sys_setdomainname sys_setdomainname +122 common uname sys_newuname sys_newuname +124 common adjtimex sys_adjtimex sys_adjtimex_time32 +125 common mprotect sys_mprotect sys_mprotect +126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 common create_module - - +128 common init_module sys_init_module sys_init_module +129 common delete_module sys_delete_module sys_delete_module +130 common get_kernel_syms - - +131 common quotactl sys_quotactl sys_quotactl +132 common getpgid sys_getpgid sys_getpgid +133 common fchdir sys_fchdir sys_fchdir +134 common bdflush sys_ni_syscall sys_ni_syscall +135 common sysfs sys_sysfs sys_sysfs +136 common personality sys_s390_personality sys_s390_personality +137 common afs_syscall - - +138 32 setfsuid - sys_setfsuid16 +139 32 setfsgid - sys_setfsgid16 +140 32 _llseek - sys_llseek +141 common getdents sys_getdents compat_sys_getdents +142 32 _newselect - compat_sys_select +142 64 select sys_select - +143 common flock sys_flock sys_flock +144 common msync sys_msync sys_msync +145 common readv sys_readv sys_readv +146 common writev sys_writev sys_writev +147 common getsid sys_getsid sys_getsid +148 common fdatasync sys_fdatasync sys_fdatasync +149 common _sysctl - - +150 common mlock sys_mlock sys_mlock +151 common munlock sys_munlock sys_munlock +152 common mlockall sys_mlockall sys_mlockall +153 common munlockall sys_munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep sys_nanosleep_time32 +163 common mremap sys_mremap sys_mremap +164 32 setresuid - sys_setresuid16 +165 32 getresuid - sys_getresuid16 +167 common query_module - - +168 common poll sys_poll sys_poll +169 common nfsservctl - - +170 32 setresgid - sys_setresgid16 +171 32 getresgid - sys_getresgid16 +172 common prctl sys_prctl sys_prctl +173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 +178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 common pread64 sys_pread64 compat_sys_s390_pread64 +181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 +182 32 chown - sys_chown16 +183 common getcwd sys_getcwd sys_getcwd +184 common capget sys_capget sys_capget +185 common capset sys_capset sys_capset +186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 common sendfile sys_sendfile64 compat_sys_sendfile +188 common getpmsg - - +189 common putpmsg - - +190 common vfork sys_vfork sys_vfork +191 32 ugetrlimit - compat_sys_getrlimit +191 64 getrlimit sys_getrlimit - +192 32 mmap2 - compat_sys_s390_mmap2 +193 32 truncate64 - compat_sys_s390_truncate64 +194 32 ftruncate64 - compat_sys_s390_ftruncate64 +195 32 stat64 - compat_sys_s390_stat64 +196 32 lstat64 - compat_sys_s390_lstat64 +197 32 fstat64 - compat_sys_s390_fstat64 +198 32 lchown32 - sys_lchown +198 64 lchown sys_lchown - +199 32 getuid32 - sys_getuid +199 64 getuid sys_getuid - +200 32 getgid32 - sys_getgid +200 64 getgid sys_getgid - +201 32 geteuid32 - sys_geteuid +201 64 geteuid sys_geteuid - +202 32 getegid32 - sys_getegid +202 64 getegid sys_getegid - +203 32 setreuid32 - sys_setreuid +203 64 setreuid sys_setreuid - +204 32 setregid32 - sys_setregid +204 64 setregid sys_setregid - +205 32 getgroups32 - sys_getgroups +205 64 getgroups sys_getgroups - +206 32 setgroups32 - sys_setgroups +206 64 setgroups sys_setgroups - +207 32 fchown32 - sys_fchown +207 64 fchown sys_fchown - +208 32 setresuid32 - sys_setresuid +208 64 setresuid sys_setresuid - +209 32 getresuid32 - sys_getresuid +209 64 getresuid sys_getresuid - +210 32 setresgid32 - sys_setresgid +210 64 setresgid sys_setresgid - +211 32 getresgid32 - sys_getresgid +211 64 getresgid sys_getresgid - +212 32 chown32 - sys_chown +212 64 chown sys_chown - +213 32 setuid32 - sys_setuid +213 64 setuid sys_setuid - +214 32 setgid32 - sys_setgid +214 64 setgid sys_setgid - +215 32 setfsuid32 - sys_setfsuid +215 64 setfsuid sys_setfsuid - +216 32 setfsgid32 - sys_setfsgid +216 64 setfsgid sys_setfsgid - +217 common pivot_root sys_pivot_root sys_pivot_root +218 common mincore sys_mincore sys_mincore +219 common madvise sys_madvise sys_madvise +220 common getdents64 sys_getdents64 sys_getdents64 +221 32 fcntl64 - compat_sys_fcntl64 +222 common readahead sys_readahead compat_sys_s390_readahead +223 32 sendfile64 - compat_sys_sendfile64 +224 common setxattr sys_setxattr sys_setxattr +225 common lsetxattr sys_lsetxattr sys_lsetxattr +226 common fsetxattr sys_fsetxattr sys_fsetxattr +227 common getxattr sys_getxattr sys_getxattr +228 common lgetxattr sys_lgetxattr sys_lgetxattr +229 common fgetxattr sys_fgetxattr sys_fgetxattr +230 common listxattr sys_listxattr sys_listxattr +231 common llistxattr sys_llistxattr sys_llistxattr +232 common flistxattr sys_flistxattr sys_flistxattr +233 common removexattr sys_removexattr sys_removexattr +234 common lremovexattr sys_lremovexattr sys_lremovexattr +235 common fremovexattr sys_fremovexattr sys_fremovexattr +236 common gettid sys_gettid sys_gettid +237 common tkill sys_tkill sys_tkill +238 common futex sys_futex sys_futex_time32 +239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +241 common tgkill sys_tgkill sys_tgkill +243 common io_setup sys_io_setup compat_sys_io_setup +244 common io_destroy sys_io_destroy sys_io_destroy +245 common io_getevents sys_io_getevents sys_io_getevents_time32 +246 common io_submit sys_io_submit compat_sys_io_submit +247 common io_cancel sys_io_cancel sys_io_cancel +248 common exit_group sys_exit_group sys_exit_group +249 common epoll_create sys_epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl sys_epoll_ctl +251 common epoll_wait sys_epoll_wait sys_epoll_wait +252 common set_tid_address sys_set_tid_address sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 +254 common timer_create sys_timer_create compat_sys_timer_create +255 common timer_settime sys_timer_settime sys_timer_settime32 +256 common timer_gettime sys_timer_gettime sys_timer_gettime32 +257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime sys_clock_settime32 +260 common clock_gettime sys_clock_gettime sys_clock_gettime32 +261 common clock_getres sys_clock_getres sys_clock_getres_time32 +262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 +264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 +265 common statfs64 sys_statfs64 compat_sys_statfs64 +266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy +271 common mq_open sys_mq_open compat_sys_mq_open +272 common mq_unlink sys_mq_unlink sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 +275 common mq_notify sys_mq_notify compat_sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +277 common kexec_load sys_kexec_load compat_sys_kexec_load +278 common add_key sys_add_key sys_add_key +279 common request_key sys_request_key sys_request_key +280 common keyctl sys_keyctl compat_sys_keyctl +281 common waitid sys_waitid compat_sys_waitid +282 common ioprio_set sys_ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages sys_migrate_pages +288 common openat sys_openat compat_sys_openat +289 common mkdirat sys_mkdirat sys_mkdirat +290 common mknodat sys_mknodat sys_mknodat +291 common fchownat sys_fchownat sys_fchownat +292 common futimesat sys_futimesat sys_futimesat_time32 +293 32 fstatat64 - compat_sys_s390_fstatat64 +293 64 newfstatat sys_newfstatat - +294 common unlinkat sys_unlinkat sys_unlinkat +295 common renameat sys_renameat sys_renameat +296 common linkat sys_linkat sys_linkat +297 common symlinkat sys_symlinkat sys_symlinkat +298 common readlinkat sys_readlinkat sys_readlinkat +299 common fchmodat sys_fchmodat sys_fchmodat +300 common faccessat sys_faccessat sys_faccessat +301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 +302 common ppoll sys_ppoll compat_sys_ppoll_time32 +303 common unshare sys_unshare sys_unshare +304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list +305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list +306 common splice sys_splice sys_splice +307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range +308 common tee sys_tee sys_tee +309 common vmsplice sys_vmsplice sys_vmsplice +310 common move_pages sys_move_pages sys_move_pages +311 common getcpu sys_getcpu sys_getcpu +312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait +313 common utimes sys_utimes sys_utimes_time32 +314 common fallocate sys_fallocate compat_sys_s390_fallocate +315 common utimensat sys_utimensat sys_utimensat_time32 +316 common signalfd sys_signalfd compat_sys_signalfd +317 common timerfd - - +318 common eventfd sys_eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 +322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 +323 common eventfd2 sys_eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 sys_pipe2 +326 common dup3 sys_dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv compat_sys_preadv +329 common pwritev sys_pwritev compat_sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open sys_perf_event_open +332 common fanotify_init sys_fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +334 common prlimit64 sys_prlimit64 sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 +338 common syncfs sys_syncfs sys_syncfs +339 common setns sys_setns sys_setns +340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp sys_kcmp +344 common finit_module sys_finit_module sys_finit_module +345 common sched_setattr sys_sched_setattr sys_sched_setattr +346 common sched_getattr sys_sched_getattr sys_sched_getattr +347 common renameat2 sys_renameat2 sys_renameat2 +348 common seccomp sys_seccomp sys_seccomp +349 common getrandom sys_getrandom sys_getrandom +350 common memfd_create sys_memfd_create sys_memfd_create +351 common bpf sys_bpf sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read +354 common execveat sys_execveat compat_sys_execveat +355 common userfaultfd sys_userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 +358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg +359 common socket sys_socket sys_socket +360 common socketpair sys_socketpair sys_socketpair +361 common bind sys_bind sys_bind +362 common connect sys_connect sys_connect +363 common listen sys_listen sys_listen +364 common accept4 sys_accept4 sys_accept4 +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt +367 common getsockname sys_getsockname sys_getsockname +368 common getpeername sys_getpeername sys_getpeername +369 common sendto sys_sendto sys_sendto +370 common sendmsg sys_sendmsg compat_sys_sendmsg +371 common recvfrom sys_recvfrom compat_sys_recvfrom +372 common recvmsg sys_recvmsg compat_sys_recvmsg +373 common shutdown sys_shutdown sys_shutdown +374 common mlock2 sys_mlock2 sys_mlock2 +375 common copy_file_range sys_copy_file_range sys_copy_file_range +376 common preadv2 sys_preadv2 compat_sys_preadv2 +377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage sys_s390_guarded_storage +379 common statx sys_statx sys_statx +380 common s390_sthyi sys_s390_sthyi sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +383 common rseq sys_rseq sys_rseq +384 common pkey_mprotect sys_pkey_mprotect sys_pkey_mprotect +385 common pkey_alloc sys_pkey_alloc sys_pkey_alloc +386 common pkey_free sys_pkey_free sys_pkey_free +# room for arch specific syscalls +392 64 semtimedop sys_semtimedop - +393 common semget sys_semget sys_semget +394 common semctl sys_semctl compat_sys_semctl +395 common shmget sys_shmget sys_shmget +396 common shmctl sys_shmctl compat_sys_shmctl +397 common shmat sys_shmat compat_sys_shmat +398 common shmdt sys_shmdt sys_shmdt +399 common msgget sys_msgget sys_msgget +400 common msgsnd sys_msgsnd compat_sys_msgsnd +401 common msgrcv sys_msgrcv compat_sys_msgrcv +402 common msgctl sys_msgctl compat_sys_msgctl +403 32 clock_gettime64 - sys_clock_gettime +404 32 clock_settime64 - sys_clock_settime +405 32 clock_adjtime64 - sys_clock_adjtime +406 32 clock_getres_time64 - sys_clock_getres +407 32 clock_nanosleep_time64 - sys_clock_nanosleep +408 32 timer_gettime64 - sys_timer_gettime +409 32 timer_settime64 - sys_timer_settime +410 32 timerfd_gettime64 - sys_timerfd_gettime +411 32 timerfd_settime64 - sys_timerfd_settime +412 32 utimensat_time64 - sys_utimensat +413 32 pselect6_time64 - compat_sys_pselect6_time64 +414 32 ppoll_time64 - compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 - sys_io_pgetevents +417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 - sys_mq_timedsend +419 32 mq_timedreceive_time64 - sys_mq_timedreceive +420 32 semtimedop_time64 - sys_semtimedop +421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 - sys_futex +423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree sys_open_tree +429 common move_mount sys_move_mount sys_move_mount +430 common fsopen sys_fsopen sys_fsopen +431 common fsconfig sys_fsconfig sys_fsconfig +432 common fsmount sys_fsmount sys_fsmount +433 common fspick sys_fspick sys_fspick +434 common pidfd_open sys_pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range +437 common openat2 sys_openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 diff --git a/arch/s390/kernel/syscalls/syscalltbl b/arch/s390/kernel/syscalls/syscalltbl new file mode 100755 index 0000000000..fbac1732f8 --- /dev/null +++ b/arch/s390/kernel/syscalls/syscalltbl @@ -0,0 +1,232 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate system call table and header files +# +# Copyright IBM Corp. 2018 +# Author(s): Hendrik Brueckner + +# +# File path to the system call table definition. +# You can set the path with the -i option. If omitted, +# system call table definitions are read from standard input. +# +SYSCALL_TBL="" + + +create_syscall_table_entries() +{ + local nr abi name entry64 entry32 _ignore + local temp=$(mktemp ${TMPDIR:-/tmp}/syscalltbl-common.XXXXXXXXX) + + ( + # + # Initialize with 0 to create an NI_SYSCALL for 0 + # + local prev_nr=0 prev_32=sys_ni_syscall prev_64=sys_ni_syscall + while read nr abi name entry64 entry32 _ignore; do + test x$entry32 = x- && entry32=sys_ni_syscall + test x$entry64 = x- && entry64=sys_ni_syscall + + if test $prev_nr -eq $nr; then + # + # Same syscall but different ABI, just update + # the respective entry point + # + case $abi in + 32) + prev_32=$entry32 + ;; + 64) + prev_64=$entry64 + ;; + esac + continue; + else + printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 + fi + + prev_nr=$nr + prev_64=$entry64 + prev_32=$entry32 + done + printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 + ) >> $temp + + # + # Check for duplicate syscall numbers + # + if ! cat $temp |cut -f1 |uniq -d 2>&1; then + echo "Error: generated system call table contains duplicate entries: $temp" >&2 + exit 1 + fi + + # + # Generate syscall table + # + prev_nr=0 + while read nr entry64 entry32; do + while test $prev_nr -lt $((nr - 1)); do + printf "NI_SYSCALL\n" + prev_nr=$((prev_nr + 1)) + done + if test x$entry64 = xsys_ni_syscall && + test x$entry32 = xsys_ni_syscall; then + printf "NI_SYSCALL\n" + else + printf "SYSCALL(%s,%s)\n" $entry64 $entry32 + fi + prev_nr=$nr + done < $temp + rm $temp +} + +generate_syscall_table() +{ + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 */ + /* + * Definitions for sys_call_table, each line represents an + * entry in the table in the form + * SYSCALL(64 bit syscall, 31 bit emulated syscall) + * + * This file is meant to be included from entry.S. + */ + + #define NI_SYSCALL SYSCALL(sys_ni_syscall,sys_ni_syscall) + +EoHEADER + grep -Ev '^(#|[[:blank:]]*$)' $SYSCALL_TBL \ + |sort -k1 -n \ + |create_syscall_table_entries +} + +create_header_defines() +{ + local nr abi name _ignore + + while read nr abi name _ignore; do + printf "#define __NR_%s %d\n" $name $nr + done +} + +normalize_fileguard() +{ + local fileguard="$1" + + echo "$1" |tr '[[:lower:]]' '[[:upper:]]' \ + |sed -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g' +} + +generate_syscall_header() +{ + local abis=$(echo "($1)" | tr ',' '|') + local filename="$2" + local fileguard suffix + + if test "$filename"; then + fileguard=$(normalize_fileguard "__UAPI_ASM_S390_$2") + else + case "$abis" in + *64*) suffix=64 ;; + *32*) suffix=32 ;; + esac + fileguard=$(normalize_fileguard "__UAPI_ASM_S390_SYSCALLS_$suffix") + fi + + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + #ifndef ${fileguard} + #define ${fileguard} + +EoHEADER + + grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ + |sort -k1 -n \ + |create_header_defines + + cat <<-EoFOOTER + + #endif /* ${fileguard} */ +EoFOOTER +} + +__max_syscall_nr() +{ + local abis=$(echo "($1)" | tr ',' '|') + + grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ + |sed -ne 's/^\([[:digit:]]*\)[[:space:]].*/\1/p' \ + |sort -n \ + |tail -1 +} + + +generate_syscall_nr() +{ + local abis="$1" + local max_syscall_nr num_syscalls + + max_syscall_nr=$(__max_syscall_nr "$abis") + num_syscalls=$((max_syscall_nr + 1)) + + cat <<-EoHEADER + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + #ifndef __ASM_S390_SYSCALLS_NR + #define __ASM_S390_SYSCALLS_NR + + #define NR_syscalls ${num_syscalls} + + #endif /* __ASM_S390_SYSCALLS_NR */ +EoHEADER +} + + +# +# Parse command line arguments +# +do_syscall_header="" +do_syscall_table="" +do_syscall_nr="" +output_file="" +abi_list="common,64" +filename="" +while getopts ":HNSXi:a:f:" arg; do + case $arg in + a) + abi_list="$OPTARG" + ;; + i) + SYSCALL_TBL="$OPTARG" + ;; + f) + filename=${OPTARG##*/} + ;; + H) + do_syscall_header=1 + ;; + N) + do_syscall_nr=1 + ;; + S) + do_syscall_table=1 + ;; + X) + set -x + ;; + :) + echo "Missing argument for -$OPTARG" >&2 + exit 1 + ;; + \?) + echo "Invalid option specified" >&2 + exit 1 + ;; + esac +done + +test "$do_syscall_header" && generate_syscall_header "$abi_list" "$filename" +test "$do_syscall_table" && generate_syscall_table +test "$do_syscall_nr" && generate_syscall_nr "$abi_list" + +exit 0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c new file mode 100644 index 0000000000..b5e364358c --- /dev/null +++ b/arch/s390/kernel/sysinfo.c @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2001, 2009 + * Author(s): Ulrich Weigand , + * Martin Schwidefsky , + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int topology_max_mnest; + +static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) +{ + int r0 = (fc << 28) | sel1; + int rc = 0; + + asm volatile( + " lr 0,%[r0]\n" + " lr 1,%[r1]\n" + " stsi 0(%[sysinfo])\n" + "0: jz 2f\n" + "1: lhi %[rc],%[retval]\n" + "2: lr %[r0],0\n" + EX_TABLE(0b, 1b) + : [r0] "+d" (r0), [rc] "+d" (rc) + : [r1] "d" (sel2), + [sysinfo] "a" (sysinfo), + [retval] "K" (-EOPNOTSUPP) + : "cc", "0", "1", "memory"); + *lvl = ((unsigned int) r0) >> 28; + return rc; +} + +/* + * stsi - store system information + * + * Returns the current configuration level if function code 0 was specified. + * Otherwise returns 0 on success or a negative value on error. + */ +int stsi(void *sysinfo, int fc, int sel1, int sel2) +{ + int lvl, rc; + + rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); + if (rc) + return rc; + return fc ? 0 : lvl; +} +EXPORT_SYMBOL(stsi); + +#ifdef CONFIG_PROC_FS + +static bool convert_ext_name(unsigned char encoding, char *name, size_t len) +{ + switch (encoding) { + case 1: /* EBCDIC */ + EBCASC(name, len); + break; + case 2: /* UTF-8 */ + break; + default: + return false; + } + return true; +} + +static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) +{ + int i; + + if (stsi(info, 1, 1, 1)) + return; + EBCASC(info->manufacturer, sizeof(info->manufacturer)); + EBCASC(info->type, sizeof(info->type)); + EBCASC(info->model, sizeof(info->model)); + EBCASC(info->sequence, sizeof(info->sequence)); + EBCASC(info->plant, sizeof(info->plant)); + EBCASC(info->model_capacity, sizeof(info->model_capacity)); + EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); + EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); + seq_printf(m, "Type: %-4.4s\n", info->type); + if (info->lic) + seq_printf(m, "LIC Identifier: %016lx\n", info->lic); + /* + * Sigh: the model field has been renamed with System z9 + * to model_capacity and a new model field has been added + * after the plant field. To avoid confusing older programs + * the "Model:" prints "model_capacity model" or just + * "model_capacity" if the model string is empty . + */ + seq_printf(m, "Model: %-16.16s", info->model_capacity); + if (info->model[0] != '\0') + seq_printf(m, " %-16.16s", info->model); + seq_putc(m, '\n'); + seq_printf(m, "Sequence Code: %-16.16s\n", info->sequence); + seq_printf(m, "Plant: %-4.4s\n", info->plant); + seq_printf(m, "Model Capacity: %-16.16s %08u\n", + info->model_capacity, info->model_cap_rating); + if (info->model_perm_cap_rating) + seq_printf(m, "Model Perm. Capacity: %-16.16s %08u\n", + info->model_perm_cap, + info->model_perm_cap_rating); + if (info->model_temp_cap_rating) + seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", + info->model_temp_cap, + info->model_temp_cap_rating); + if (info->ncr) + seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); + if (info->npr) + seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); + if (info->ntr) + seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (info->cai) { + seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); + seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); + seq_printf(m, "Capacity Transient: %d\n", info->t); + } + if (info->p) { + for (i = 1; i <= ARRAY_SIZE(info->typepct); i++) { + seq_printf(m, "Type %d Percentage: %d\n", + i, info->typepct[i - 1]); + } + } +} + +static void stsi_15_1_x(struct seq_file *m, struct sysinfo_15_1_x *info) +{ + int i; + + seq_putc(m, '\n'); + if (!MACHINE_HAS_TOPOLOGY) + return; + if (stsi(info, 15, 1, topology_max_mnest)) + return; + seq_printf(m, "CPU Topology HW: "); + for (i = 0; i < TOPOLOGY_NR_MAG; i++) + seq_printf(m, " %d", info->mag[i]); + seq_putc(m, '\n'); +#ifdef CONFIG_SCHED_TOPOLOGY + store_topology(info); + seq_printf(m, "CPU Topology SW: "); + for (i = 0; i < TOPOLOGY_NR_MAG; i++) + seq_printf(m, " %d", info->mag[i]); + seq_putc(m, '\n'); +#endif +} + +static void stsi_1_2_2(struct seq_file *m, struct sysinfo_1_2_2 *info) +{ + struct sysinfo_1_2_2_extension *ext; + int i; + + if (stsi(info, 1, 2, 2)) + return; + ext = (struct sysinfo_1_2_2_extension *) + ((unsigned long) info + info->acc_offset); + seq_printf(m, "CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "CPUs Reserved: %d\n", info->cpus_reserved); + if (info->mt_installed) { + seq_printf(m, "CPUs G-MTID: %d\n", info->mt_gtid); + seq_printf(m, "CPUs S-MTID: %d\n", info->mt_stid); + } + /* + * Sigh 2. According to the specification the alternate + * capability field is a 32 bit floating point number + * if the higher order 8 bits are not zero. Printing + * a floating point number in the kernel is a no-no, + * always print the number as 32 bit unsigned integer. + * The user-space needs to know about the strange + * encoding of the alternate cpu capability. + */ + seq_printf(m, "Capability: %u", info->capability); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_capability); + seq_putc(m, '\n'); + if (info->nominal_cap) + seq_printf(m, "Nominal Capability: %d\n", info->nominal_cap); + if (info->secondary_cap) + seq_printf(m, "Secondary Capability: %d\n", info->secondary_cap); + for (i = 2; i <= info->cpus_total; i++) { + seq_printf(m, "Adjustment %02d-way: %u", + i, info->adjustment[i-2]); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_adjustment[i-2]); + seq_putc(m, '\n'); + } +} + +static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) +{ + if (stsi(info, 2, 2, 2)) + return; + EBCASC(info->name, sizeof(info->name)); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Number: %d\n", info->lpar_number); + seq_printf(m, "LPAR Characteristics: "); + if (info->characteristics & LPAR_CHAR_DEDICATED) + seq_printf(m, "Dedicated "); + if (info->characteristics & LPAR_CHAR_SHARED) + seq_printf(m, "Shared "); + if (info->characteristics & LPAR_CHAR_LIMITED) + seq_printf(m, "Limited "); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Name: %-8.8s\n", info->name); + seq_printf(m, "LPAR Adjustment: %d\n", info->caf); + seq_printf(m, "LPAR CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "LPAR CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "LPAR CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "LPAR CPUs Reserved: %d\n", info->cpus_reserved); + seq_printf(m, "LPAR CPUs Dedicated: %d\n", info->cpus_dedicated); + seq_printf(m, "LPAR CPUs Shared: %d\n", info->cpus_shared); + if (info->mt_installed) { + seq_printf(m, "LPAR CPUs G-MTID: %d\n", info->mt_gtid); + seq_printf(m, "LPAR CPUs S-MTID: %d\n", info->mt_stid); + seq_printf(m, "LPAR CPUs PS-MTID: %d\n", info->mt_psmtid); + } + if (convert_ext_name(info->vsne, info->ext_name, sizeof(info->ext_name))) { + seq_printf(m, "LPAR Extended Name: %-.256s\n", info->ext_name); + seq_printf(m, "LPAR UUID: %pUb\n", &info->uuid); + } +} + +static void print_ext_name(struct seq_file *m, int lvl, + struct sysinfo_3_2_2 *info) +{ + size_t len = sizeof(info->ext_names[lvl]); + + if (!convert_ext_name(info->vm[lvl].evmne, info->ext_names[lvl], len)) + return; + seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl, + info->ext_names[lvl]); +} + +static void print_uuid(struct seq_file *m, int i, struct sysinfo_3_2_2 *info) +{ + if (uuid_is_null(&info->vm[i].uuid)) + return; + seq_printf(m, "VM%02d UUID: %pUb\n", i, &info->vm[i].uuid); +} + +static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) +{ + int i; + + if (stsi(info, 3, 2, 2)) + return; + for (i = 0; i < info->count; i++) { + EBCASC(info->vm[i].name, sizeof(info->vm[i].name)); + EBCASC(info->vm[i].cpi, sizeof(info->vm[i].cpi)); + seq_putc(m, '\n'); + seq_printf(m, "VM%02d Name: %-8.8s\n", i, info->vm[i].name); + seq_printf(m, "VM%02d Control Program: %-16.16s\n", i, info->vm[i].cpi); + seq_printf(m, "VM%02d Adjustment: %d\n", i, info->vm[i].caf); + seq_printf(m, "VM%02d CPUs Total: %d\n", i, info->vm[i].cpus_total); + seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured); + seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby); + seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved); + print_ext_name(m, i, info); + print_uuid(m, i, info); + } +} + +static int sysinfo_show(struct seq_file *m, void *v) +{ + void *info = (void *)get_zeroed_page(GFP_KERNEL); + int level; + + if (!info) + return 0; + level = stsi(NULL, 0, 0, 0); + if (level >= 1) + stsi_1_1_1(m, info); + if (level >= 1) + stsi_15_1_x(m, info); + if (level >= 1) + stsi_1_2_2(m, info); + if (level >= 2) + stsi_2_2_2(m, info); + if (level >= 3) + stsi_3_2_2(m, info); + free_page((unsigned long)info); + return 0; +} + +static int __init sysinfo_create_proc(void) +{ + proc_create_single("sysinfo", 0444, NULL, sysinfo_show); + return 0; +} +device_initcall(sysinfo_create_proc); + +#endif /* CONFIG_PROC_FS */ + +/* + * Service levels interface. + */ + +static DECLARE_RWSEM(service_level_sem); +static LIST_HEAD(service_level_list); + +int register_service_level(struct service_level *slr) +{ + struct service_level *ptr; + + down_write(&service_level_sem); + list_for_each_entry(ptr, &service_level_list, list) + if (ptr == slr) { + up_write(&service_level_sem); + return -EEXIST; + } + list_add_tail(&slr->list, &service_level_list); + up_write(&service_level_sem); + return 0; +} +EXPORT_SYMBOL(register_service_level); + +int unregister_service_level(struct service_level *slr) +{ + struct service_level *ptr, *next; + int rc = -ENOENT; + + down_write(&service_level_sem); + list_for_each_entry_safe(ptr, next, &service_level_list, list) { + if (ptr != slr) + continue; + list_del(&ptr->list); + rc = 0; + break; + } + up_write(&service_level_sem); + return rc; +} +EXPORT_SYMBOL(unregister_service_level); + +static void *service_level_start(struct seq_file *m, loff_t *pos) +{ + down_read(&service_level_sem); + return seq_list_start(&service_level_list, *pos); +} + +static void *service_level_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &service_level_list, pos); +} + +static void service_level_stop(struct seq_file *m, void *p) +{ + up_read(&service_level_sem); +} + +static int service_level_show(struct seq_file *m, void *p) +{ + struct service_level *slr; + + slr = list_entry(p, struct service_level, list); + slr->seq_print(m, slr); + return 0; +} + +static const struct seq_operations service_level_seq_ops = { + .start = service_level_start, + .next = service_level_next, + .stop = service_level_stop, + .show = service_level_show +}; + +static void service_level_vm_print(struct seq_file *m, + struct service_level *slr) +{ + char *query_buffer, *str; + + query_buffer = kmalloc(1024, GFP_KERNEL | GFP_DMA); + if (!query_buffer) + return; + cpcmd("QUERY CPLEVEL", query_buffer, 1024, NULL); + str = strchr(query_buffer, '\n'); + if (str) + *str = 0; + seq_printf(m, "VM: %s\n", query_buffer); + kfree(query_buffer); +} + +static struct service_level service_level_vm = { + .seq_print = service_level_vm_print +}; + +static __init int create_proc_service_level(void) +{ + proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops); + if (MACHINE_IS_VM) + register_service_level(&service_level_vm); + return 0; +} +subsys_initcall(create_proc_service_level); + +/* + * CPU capability might have changed. Therefore recalculate loops_per_jiffy. + */ +void s390_adjust_jiffies(void) +{ + struct sysinfo_1_2_2 *info; + unsigned long capability; + struct kernel_fpu fpu; + + info = (void *) get_zeroed_page(GFP_KERNEL); + if (!info) + return; + + if (stsi(info, 1, 2, 2) == 0) { + /* + * Major sigh. The cpu capability encoding is "special". + * If the first 9 bits of info->capability are 0 then it + * is a 32 bit unsigned integer in the range 0 .. 2^23. + * If the first 9 bits are != 0 then it is a 32 bit float. + * In addition a lower value indicates a proportionally + * higher cpu capacity. Bogomips are the other way round. + * To get to a halfway suitable number we divide 1e7 + * by the cpu capability number. Yes, that means a floating + * point division .. + */ + kernel_fpu_begin(&fpu, KERNEL_FPR); + asm volatile( + " sfpc %3\n" + " l %0,%1\n" + " tmlh %0,0xff80\n" + " jnz 0f\n" + " cefbr %%f2,%0\n" + " j 1f\n" + "0: le %%f2,%1\n" + "1: cefbr %%f0,%2\n" + " debr %%f0,%%f2\n" + " cgebr %0,5,%%f0\n" + : "=&d" (capability) + : "Q" (info->capability), "d" (10000000), "d" (0) + : "cc" + ); + kernel_fpu_end(&fpu, KERNEL_FPR); + } else + /* + * Really old machine without stsi block for basic + * cpu information. Report 42.0 bogomips. + */ + capability = 42; + loops_per_jiffy = capability * (500000/HZ); + free_page((unsigned long) info); +} + +/* + * calibrate the delay loop + */ +void calibrate_delay(void) +{ + s390_adjust_jiffies(); + /* Print the good old Bogomips line .. */ + printk(KERN_DEBUG "Calibrating delay loop (skipped)... " + "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); +} + +#ifdef CONFIG_DEBUG_FS + +#define STSI_FILE(fc, s1, s2) \ +static int stsi_open_##fc##_##s1##_##s2(struct inode *inode, struct file *file)\ +{ \ + file->private_data = (void *) get_zeroed_page(GFP_KERNEL); \ + if (!file->private_data) \ + return -ENOMEM; \ + if (stsi(file->private_data, fc, s1, s2)) { \ + free_page((unsigned long)file->private_data); \ + file->private_data = NULL; \ + return -EACCES; \ + } \ + return nonseekable_open(inode, file); \ +} \ + \ +static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ + .open = stsi_open_##fc##_##s1##_##s2, \ + .release = stsi_release, \ + .read = stsi_read, \ + .llseek = no_llseek, \ +}; + +static int stsi_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t stsi_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + return simple_read_from_buffer(buf, size, ppos, file->private_data, PAGE_SIZE); +} + +STSI_FILE( 1, 1, 1); +STSI_FILE( 1, 2, 1); +STSI_FILE( 1, 2, 2); +STSI_FILE( 2, 2, 1); +STSI_FILE( 2, 2, 2); +STSI_FILE( 3, 2, 2); +STSI_FILE(15, 1, 2); +STSI_FILE(15, 1, 3); +STSI_FILE(15, 1, 4); +STSI_FILE(15, 1, 5); +STSI_FILE(15, 1, 6); + +struct stsi_file { + const struct file_operations *fops; + char *name; +}; + +static struct stsi_file stsi_file[] __initdata = { + {.fops = &stsi_1_1_1_fs_ops, .name = "1_1_1"}, + {.fops = &stsi_1_2_1_fs_ops, .name = "1_2_1"}, + {.fops = &stsi_1_2_2_fs_ops, .name = "1_2_2"}, + {.fops = &stsi_2_2_1_fs_ops, .name = "2_2_1"}, + {.fops = &stsi_2_2_2_fs_ops, .name = "2_2_2"}, + {.fops = &stsi_3_2_2_fs_ops, .name = "3_2_2"}, + {.fops = &stsi_15_1_2_fs_ops, .name = "15_1_2"}, + {.fops = &stsi_15_1_3_fs_ops, .name = "15_1_3"}, + {.fops = &stsi_15_1_4_fs_ops, .name = "15_1_4"}, + {.fops = &stsi_15_1_5_fs_ops, .name = "15_1_5"}, + {.fops = &stsi_15_1_6_fs_ops, .name = "15_1_6"}, +}; + +static u8 stsi_0_0_0; + +static __init int stsi_init_debugfs(void) +{ + struct dentry *stsi_root; + struct stsi_file *sf; + int lvl, i; + + stsi_root = debugfs_create_dir("stsi", arch_debugfs_dir); + lvl = stsi(NULL, 0, 0, 0); + if (lvl > 0) + stsi_0_0_0 = lvl; + debugfs_create_u8("0_0_0", 0400, stsi_root, &stsi_0_0_0); + for (i = 0; i < ARRAY_SIZE(stsi_file); i++) { + sf = &stsi_file[i]; + debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); + } + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + char link_to[10]; + + sprintf(link_to, "15_1_%d", topology_mnest_limit()); + debugfs_create_symlink("topology", stsi_root, link_to); + } + return 0; +} +device_initcall(stsi_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S new file mode 100644 index 0000000000..14c6d25c03 --- /dev/null +++ b/arch/s390/kernel/text_amode31.S @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include +#include +#include +#include + + .section .amode31.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_AMODE31_r14 + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .endm + +/* + * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +SYM_FUNC_START(_diag14_amode31) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault) +SYM_FUNC_END(_diag14_amode31) + +/* + * int _diag210_amode31(struct diag210 *addr) + */ +SYM_FUNC_START(_diag210_amode31) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault) +SYM_FUNC_END(_diag210_amode31) + +/* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +SYM_FUNC_START(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +SYM_FUNC_END(_diag8c_amode31) +/* + * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) + */ +SYM_FUNC_START(_diag26c_amode31) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex) +SYM_FUNC_END(_diag26c_amode31) + +/* + * void _diag0c_amode31(struct hypfs_diag0c_entry *entry) + */ +SYM_FUNC_START(_diag0c_amode31) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag0c_amode31) + +/* + * void _diag308_reset_amode31(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +SYM_FUNC_START(_diag308_reset_amode31) + larl %r4,ctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,ctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,fpctl # Floating point control register + stfpc 0(%r4) + larl %r4,prefix # Save prefix register + stpx 0(%r4) + larl %r4,prefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,continue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0 + larl %r3,restart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,ctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,fpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,prefix # Restore prefix register + spx 0(%r4) + larl %r4,continue_psw # Restore PSW flags + larl %r2,.Lcontinue + stg %r2,8(%r4) + lpswe 0(%r4) +.Lcontinue: + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag308_reset_amode31) + + .section .amode31.data,"aw",@progbits + .balign 8 +SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000) +SYM_DATA_LOCAL(continue_psw, .quad 0,0) +SYM_DATA_LOCAL(ctlreg0, .quad 0) +SYM_DATA_LOCAL(ctlregs, .fill 16,8,0) +SYM_DATA_LOCAL(fpctl, .long 0) +SYM_DATA_LOCAL(prefix, .long 0) +SYM_DATA_LOCAL(prefix_zero, .long 0) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c new file mode 100644 index 0000000000..d34d3548c0 --- /dev/null +++ b/arch/s390/kernel/time.c @@ -0,0 +1,944 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Time of day based timer functions. + * + * S390 version + * Copyright IBM Corp. 1999, 2008 + * Author(s): Hartmut Penner (hp@de.ibm.com), + * Martin Schwidefsky (schwidefsky@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * + * Derived from "arch/i386/kernel/time.c" + * Copyright (C) 1991, 1992, 1995 Linus Torvalds + */ + +#define KMSG_COMPONENT "time" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +union tod_clock tod_clock_base __section(".data"); +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); + +static DEFINE_PER_CPU(struct clock_event_device, comparators); + +ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); +EXPORT_SYMBOL(s390_epoch_delta_notifier); + +unsigned char ptff_function_mask[16]; + +static unsigned long lpar_offset; +static unsigned long initial_leap_seconds; +static unsigned long tod_steering_end; +static long tod_steering_delta; + +/* + * Get time offsets with PTFF + */ +void __init time_early_init(void) +{ + struct ptff_qto qto; + struct ptff_qui qui; + int cs; + + /* Initialize TOD steering parameters */ + tod_steering_end = tod_clock_base.tod; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + + if (!test_facility(28)) + return; + + ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF); + + /* get LPAR offset */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + + /* get initial leap seconds */ + if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) + initial_leap_seconds = (unsigned long) + ((long) qui.old_leap * 4096000000L); +} + +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long notrace sched_clock(void) +{ + return tod_to_ns(get_tod_clock_monotonic()); +} +NOKPROBE_SYMBOL(sched_clock); + +static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt) +{ + unsigned long rem, sec, nsec; + + sec = clk->us; + rem = do_div(sec, 1000000); + nsec = ((clk->sus + (rem << 12)) * 125) >> 9; + xt->tv_sec = sec; + xt->tv_nsec = nsec; +} + +void clock_comparator_work(void) +{ + struct clock_event_device *cd; + + S390_lowcore.clock_comparator = clock_comparator_max; + cd = this_cpu_ptr(&comparators); + cd->event_handler(cd); +} + +static int s390_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + S390_lowcore.clock_comparator = get_tod_clock() + delta; + set_clock_comparator(S390_lowcore.clock_comparator); + return 0; +} + +/* + * Set up lowcore and control register of the current cpu to + * enable TOD clock and clock comparator interrupts. + */ +void init_cpu_timer(void) +{ + struct clock_event_device *cd; + int cpu; + + S390_lowcore.clock_comparator = clock_comparator_max; + set_clock_comparator(S390_lowcore.clock_comparator); + + cpu = smp_processor_id(); + cd = &per_cpu(comparators, cpu); + cd->name = "comparator"; + cd->features = CLOCK_EVT_FEAT_ONESHOT; + cd->mult = 16777; + cd->shift = 12; + cd->min_delta_ns = 1; + cd->min_delta_ticks = 1; + cd->max_delta_ns = LONG_MAX; + cd->max_delta_ticks = ULONG_MAX; + cd->rating = 400; + cd->cpumask = cpumask_of(cpu); + cd->set_next_event = s390_next_event; + + clockevents_register_device(cd); + + /* Enable clock comparator timer interrupt. */ + __ctl_set_bit(0,11); + + /* Always allow the timing alert external interrupt. */ + __ctl_set_bit(0, 4); +} + +static void clock_comparator_interrupt(struct ext_code ext_code, + unsigned int param32, + unsigned long param64) +{ + inc_irq_stat(IRQEXT_CLK); + if (S390_lowcore.clock_comparator == clock_comparator_max) + set_clock_comparator(S390_lowcore.clock_comparator); +} + +static void stp_timing_alert(struct stp_irq_parm *); + +static void timing_alert_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + inc_irq_stat(IRQEXT_TLA); + if (param32 & 0x00038000) + stp_timing_alert((struct stp_irq_parm *) ¶m32); +} + +static void stp_reset(void); + +void read_persistent_clock64(struct timespec64 *ts) +{ + union tod_clock clk; + u64 delta; + + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + store_tod_clock_ext(&clk); + clk.eitod -= delta; + ext_to_timespec64(&clk, ts); +} + +void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) +{ + struct timespec64 boot_time; + union tod_clock clk; + u64 delta; + + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + clk = tod_clock_base; + clk.eitod -= delta; + ext_to_timespec64(&clk, &boot_time); + + read_persistent_clock64(wall_time); + *boot_offset = timespec64_sub(*wall_time, boot_time); +} + +static u64 read_tod_clock(struct clocksource *cs) +{ + unsigned long now, adj; + + preempt_disable(); /* protect from changes to steering parameters */ + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj > 0)) + /* + * manually steer by 1 cycle every 2^16 cycles. This + * corresponds to shifting the tod delta by 15. 1s is + * therefore steered in ~9h. The adjust will decrease + * over time, until it finally reaches 0. + */ + now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + preempt_enable(); + return now; +} + +static struct clocksource clocksource_tod = { + .name = "tod", + .rating = 400, + .read = read_tod_clock, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1000, + .shift = 12, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vdso_clock_mode = VDSO_CLOCKMODE_TOD, +}; + +struct clocksource * __init clocksource_default_clock(void) +{ + return &clocksource_tod; +} + +/* + * Initialize the TOD clock and the CPU timer of + * the boot cpu. + */ +void __init time_init(void) +{ + /* Reset time synchronization interfaces. */ + stp_reset(); + + /* request the clock comparator external interrupt */ + if (register_external_irq(EXT_IRQ_CLK_COMP, clock_comparator_interrupt)) + panic("Couldn't request external interrupt 0x1004"); + + /* request the timing alert external interrupt */ + if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt)) + panic("Couldn't request external interrupt 0x1406"); + + if (__clocksource_register(&clocksource_tod) != 0) + panic("Could not register TOD clock source"); + + /* Enable TOD clock interrupts on the boot cpu. */ + init_cpu_timer(); + + /* Enable cpu timer interrupts on the boot cpu. */ + vtime_init(); +} + +static DEFINE_PER_CPU(atomic_t, clock_sync_word); +static DEFINE_MUTEX(stp_mutex); +static unsigned long clock_sync_flags; + +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 +#define CLOCK_SYNC_STPINFO_VALID 2 + +/* + * The get_clock function for the physical clock. It will get the current + * TOD clock, subtract the LPAR offset and write the result to *clock. + * The function returns 0 if the clock is in sync with the external time + * source. If the clock mode is local it will return -EOPNOTSUPP and + * -EAGAIN if the clock is not in sync with the external reference. + */ +int get_phys_clock(unsigned long *clock) +{ + atomic_t *sw_ptr; + unsigned int sw0, sw1; + + sw_ptr = &get_cpu_var(clock_sync_word); + sw0 = atomic_read(sw_ptr); + *clock = get_tod_clock() - lpar_offset; + sw1 = atomic_read(sw_ptr); + put_cpu_var(clock_sync_word); + if (sw0 == sw1 && (sw0 & 0x80000000U)) + /* Success: time is in sync. */ + return 0; + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return -EOPNOTSUPP; + if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) + return -EACCES; + return -EAGAIN; +} +EXPORT_SYMBOL(get_phys_clock); + +/* + * Make get_phys_clock() return -EAGAIN. + */ +static void disable_sync_clock(void *dummy) +{ + atomic_t *sw_ptr = this_cpu_ptr(&clock_sync_word); + /* + * Clear the in-sync bit 2^31. All get_phys_clock calls will + * fail until the sync bit is turned back on. In addition + * increase the "sequence" counter to avoid the race of an + * stp event and the complete recovery against get_phys_clock. + */ + atomic_andnot(0x80000000, sw_ptr); + atomic_inc(sw_ptr); +} + +/* + * Make get_phys_clock() return 0 again. + * Needs to be called from a context disabled for preemption. + */ +static void enable_sync_clock(void) +{ + atomic_t *sw_ptr = this_cpu_ptr(&clock_sync_word); + atomic_or(0x80000000, sw_ptr); +} + +/* + * Function to check if the clock is in sync. + */ +static inline int check_sync_clock(void) +{ + atomic_t *sw_ptr; + int rc; + + sw_ptr = &get_cpu_var(clock_sync_word); + rc = (atomic_read(sw_ptr) & 0x80000000U) != 0; + put_cpu_var(clock_sync_word); + return rc; +} + +/* + * Apply clock delta to the global data structures. + * This is called once on the CPU that performed the clock sync. + */ +static void clock_sync_global(long delta) +{ + unsigned long now, adj; + struct ptff_qto qto; + int cs; + + /* Fixup the monotonic sched clock. */ + tod_clock_base.eitod += delta; + /* Adjust TOD steering parameters. */ + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* Calculate how much of the old adjustment is left. */ + tod_steering_delta = (tod_steering_delta < 0) ? + -(adj >> 15) : (adj >> 15); + tod_steering_delta += delta; + if ((abs(tod_steering_delta) >> 48) != 0) + panic("TOD clock sync offset %li is too large to drift\n", + tod_steering_delta); + tod_steering_end = now + (abs(tod_steering_delta) << 15); + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } + + /* Update LPAR offset. */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + /* Call the TOD clock change notifier. */ + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &delta); +} + +/* + * Apply clock delta to the per-CPU data structures of this CPU. + * This is called for each online CPU after the call to clock_sync_global. + */ +static void clock_sync_local(long delta) +{ + /* Add the delta to the clock comparator. */ + if (S390_lowcore.clock_comparator != clock_comparator_max) { + S390_lowcore.clock_comparator += delta; + set_clock_comparator(S390_lowcore.clock_comparator); + } + /* Adjust the last_update_clock time-stamp. */ + S390_lowcore.last_update_clock += delta; +} + +/* Single threaded workqueue used for stp sync events */ +static struct workqueue_struct *time_sync_wq; + +static void __init time_init_wq(void) +{ + if (time_sync_wq) + return; + time_sync_wq = create_singlethread_workqueue("timesync"); +} + +struct clock_sync_data { + atomic_t cpus; + int in_sync; + long clock_delta; +}; + +/* + * Server Time Protocol (STP) code. + */ +static bool stp_online; +static struct stp_sstpi stp_info; +static void *stp_page; + +static void stp_work_fn(struct work_struct *work); +static DECLARE_WORK(stp_work, stp_work_fn); +static struct timer_list stp_timer; + +static int __init early_parse_stp(char *p) +{ + return kstrtobool(p, &stp_online); +} +early_param("stp", early_parse_stp); + +/* + * Reset STP attachment. + */ +static void __init stp_reset(void) +{ + int rc; + + stp_page = (void *) get_zeroed_page(GFP_ATOMIC); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); + if (rc == 0) + set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); + else if (stp_online) { + pr_warn("The real or virtual hardware system does not provide an STP interface\n"); + free_page((unsigned long) stp_page); + stp_page = NULL; + stp_online = false; + } +} + +static void stp_timeout(struct timer_list *unused) +{ + queue_work(time_sync_wq, &stp_work); +} + +static int __init stp_init(void) +{ + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return 0; + timer_setup(&stp_timer, stp_timeout, 0); + time_init_wq(); + if (!stp_online) + return 0; + queue_work(time_sync_wq, &stp_work); + return 0; +} + +arch_initcall(stp_init); + +/* + * STP timing alert. There are three causes: + * 1) timing status change + * 2) link availability change + * 3) time control parameter change + * In all three cases we are only interested in the clock source state. + * If a STP clock source is now available use it. + */ +static void stp_timing_alert(struct stp_irq_parm *intparm) +{ + if (intparm->tsc || intparm->lac || intparm->tcpc) + queue_work(time_sync_wq, &stp_work); +} + +/* + * STP sync check machine check. This is called when the timing state + * changes from the synchronized state to the unsynchronized state. + * After a STP sync check the clock is not in sync. The machine check + * is broadcasted to all cpus at the same time. + */ +int stp_sync_check(void) +{ + disable_sync_clock(NULL); + return 1; +} + +/* + * STP island condition machine check. This is called when an attached + * server attempts to communicate over an STP link and the servers + * have matching CTN ids and have a valid stratum-1 configuration + * but the configurations do not match. + */ +int stp_island_check(void) +{ + disable_sync_clock(NULL); + return 1; +} + +void stp_queue_work(void) +{ + queue_work(time_sync_wq, &stp_work); +} + +static int __store_stpinfo(void) +{ + int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + + if (rc) + clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + else + set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); + return rc; +} + +static int stpinfo_valid(void) +{ + return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags); +} + +static int stp_sync_clock(void *data) +{ + struct clock_sync_data *sync = data; + long clock_delta, flags; + static int first; + int rc; + + enable_sync_clock(); + if (xchg(&first, 1) == 0) { + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&sync->cpus) != 0) + cpu_relax(); + rc = 0; + if (stp_info.todoff || stp_info.tmd != 2) { + flags = vdso_update_begin(); + rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, + &clock_delta); + if (rc == 0) { + sync->clock_delta = clock_delta; + clock_sync_global(clock_delta); + rc = __store_stpinfo(); + if (rc == 0 && stp_info.tmd != 2) + rc = -EAGAIN; + } + vdso_update_end(flags); + } + sync->in_sync = rc ? -EAGAIN : 1; + xchg(&first, 0); + } else { + /* Slave */ + atomic_dec(&sync->cpus); + /* Wait for in_sync to be set. */ + while (READ_ONCE(sync->in_sync) == 0) + __udelay(1); + } + if (sync->in_sync != 1) + /* Didn't work. Clear per-cpu in sync bit again. */ + disable_sync_clock(NULL); + /* Apply clock delta to per-CPU fields of this CPU. */ + clock_sync_local(sync->clock_delta); + + return 0; +} + +static int stp_clear_leap(void) +{ + struct __kernel_timex txc; + int ret; + + memset(&txc, 0, sizeof(txc)); + + ret = do_adjtimex(&txc); + if (ret < 0) + return ret; + + txc.modes = ADJ_STATUS; + txc.status &= ~(STA_INS|STA_DEL); + return do_adjtimex(&txc); +} + +static void stp_check_leap(void) +{ + struct stp_stzi stzi; + struct stp_lsoib *lsoib = &stzi.lsoib; + struct __kernel_timex txc; + int64_t timediff; + int leapdiff, ret; + + if (!stp_info.lu || !check_sync_clock()) { + /* + * Either a scheduled leap second was removed by the operator, + * or STP is out of sync. In both cases, clear the leap second + * kernel flags. + */ + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + return; + } + + if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { + pr_err("stzi failed\n"); + return; + } + + timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; + leapdiff = lsoib->nlso - lsoib->also; + + if (leapdiff != 1 && leapdiff != -1) { + pr_err("Cannot schedule %d leap seconds\n", leapdiff); + return; + } + + if (timediff < 0) { + if (stp_clear_leap() < 0) + pr_err("failed to clear leap second flags\n"); + } else if (timediff < 7200) { + memset(&txc, 0, sizeof(txc)); + ret = do_adjtimex(&txc); + if (ret < 0) + return; + + txc.modes = ADJ_STATUS; + if (leapdiff > 0) + txc.status |= STA_INS; + else + txc.status |= STA_DEL; + ret = do_adjtimex(&txc); + if (ret < 0) + pr_err("failed to set leap second flags\n"); + /* arm Timer to clear leap second flags */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + } else { + /* The day the leap second is scheduled for hasn't been reached. Retry + * in one hour. + */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + } +} + +/* + * STP work. Check for the STP state and take over the clock + * synchronization if the STP clock source is usable. + */ +static void stp_work_fn(struct work_struct *work) +{ + struct clock_sync_data stp_sync; + int rc; + + /* prevent multiple execution. */ + mutex_lock(&stp_mutex); + + if (!stp_online) { + chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); + del_timer_sync(&stp_timer); + goto out_unlock; + } + + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL); + if (rc) + goto out_unlock; + + rc = __store_stpinfo(); + if (rc || stp_info.c == 0) + goto out_unlock; + + /* Skip synchronization if the clock is already in sync. */ + if (!check_sync_clock()) { + memset(&stp_sync, 0, sizeof(stp_sync)); + cpus_read_lock(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); + } + + if (!check_sync_clock()) + /* + * There is a usable clock but the synchronization failed. + * Retry after a second. + */ + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); + else if (stp_info.lu) + stp_check_leap(); + +out_unlock: + mutex_unlock(&stp_mutex); +} + +/* + * STP subsys sysfs interface functions + */ +static struct bus_type stp_subsys = { + .name = "stp", + .dev_name = "stp", +}; + +static ssize_t ctn_id_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%016lx\n", + *(unsigned long *) stp_info.ctnid); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(ctn_id); + +static ssize_t ctn_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.ctn); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(ctn_type); + +static ssize_t dst_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x2000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(dst_offset); + +static ssize_t leap_seconds_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x8000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(leap_seconds); + +static ssize_t leap_seconds_scheduled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct stp_stzi stzi; + ssize_t ret; + + mutex_lock(&stp_mutex); + if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) { + mutex_unlock(&stp_mutex); + return -ENODATA; + } + + ret = chsc_stzi(stp_page, &stzi, sizeof(stzi)); + mutex_unlock(&stp_mutex); + if (ret < 0) + return ret; + + if (!stzi.lsoib.p) + return sprintf(buf, "0,0\n"); + + return sprintf(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); +} + +static DEVICE_ATTR_RO(leap_seconds_scheduled); + +static ssize_t stratum_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(stratum); + +static ssize_t time_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x0800)) + ret = sprintf(buf, "%i\n", (int) stp_info.tto); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(time_offset); + +static ssize_t time_zone_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid() && (stp_info.vbits & 0x4000)) + ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(time_zone_offset); + +static ssize_t timing_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tmd); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(timing_mode); + +static ssize_t timing_state_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret = -ENODATA; + + mutex_lock(&stp_mutex); + if (stpinfo_valid()) + ret = sprintf(buf, "%i\n", stp_info.tst); + mutex_unlock(&stp_mutex); + return ret; +} + +static DEVICE_ATTR_RO(timing_state); + +static ssize_t online_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%i\n", stp_online); +} + +static ssize_t online_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int value; + + value = simple_strtoul(buf, NULL, 0); + if (value != 0 && value != 1) + return -EINVAL; + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return -EOPNOTSUPP; + mutex_lock(&stp_mutex); + stp_online = value; + if (stp_online) + set_bit(CLOCK_SYNC_STP, &clock_sync_flags); + else + clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); + queue_work(time_sync_wq, &stp_work); + mutex_unlock(&stp_mutex); + return count; +} + +/* + * Can't use DEVICE_ATTR because the attribute should be named + * stp/online but dev_attr_online already exists in this file .. + */ +static DEVICE_ATTR_RW(online); + +static struct attribute *stp_dev_attrs[] = { + &dev_attr_ctn_id.attr, + &dev_attr_ctn_type.attr, + &dev_attr_dst_offset.attr, + &dev_attr_leap_seconds.attr, + &dev_attr_online.attr, + &dev_attr_leap_seconds_scheduled.attr, + &dev_attr_stratum.attr, + &dev_attr_time_offset.attr, + &dev_attr_time_zone_offset.attr, + &dev_attr_timing_mode.attr, + &dev_attr_timing_state.attr, + NULL +}; +ATTRIBUTE_GROUPS(stp_dev); + +static int __init stp_init_sysfs(void) +{ + return subsys_system_register(&stp_subsys, stp_dev_groups); +} + +device_initcall(stp_init_sysfs); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c new file mode 100644 index 0000000000..68adf1de88 --- /dev/null +++ b/arch/s390/kernel/topology.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2011 + */ + +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PTF_HORIZONTAL (0UL) +#define PTF_VERTICAL (1UL) +#define PTF_CHECK (2UL) + +enum { + TOPOLOGY_MODE_HW, + TOPOLOGY_MODE_SINGLE, + TOPOLOGY_MODE_PACKAGE, + TOPOLOGY_MODE_UNINITIALIZED +}; + +struct mask_info { + struct mask_info *next; + unsigned char id; + cpumask_t mask; +}; + +static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; +static void set_topology_timer(void); +static void topology_work_fn(struct work_struct *work); +static struct sysinfo_15_1_x *tl_info; + +static DECLARE_WORK(topology_work, topology_work_fn); + +/* + * Socket/Book linked lists and cpu_topology updates are + * protected by "sched_domains_mutex". + */ +static struct mask_info socket_info; +static struct mask_info book_info; +static struct mask_info drawer_info; + +struct cpu_topology_s390 cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu) +{ + static cpumask_t mask; + + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); + switch (topology_mode) { + case TOPOLOGY_MODE_HW: + while (info) { + if (cpumask_test_cpu(cpu, &info->mask)) { + cpumask_copy(&mask, &info->mask); + break; + } + info = info->next; + } + break; + case TOPOLOGY_MODE_PACKAGE: + cpumask_copy(&mask, cpu_present_mask); + break; + default: + fallthrough; + case TOPOLOGY_MODE_SINGLE: + break; + } + cpumask_and(&mask, &mask, &cpu_setup_mask); +out: + cpumask_copy(dst, &mask); +} + +static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) +{ + static cpumask_t mask; + unsigned int max_cpu; + + cpumask_clear(&mask); + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) + goto out; + cpumask_set_cpu(cpu, &mask); + if (topology_mode != TOPOLOGY_MODE_HW) + goto out; + cpu -= cpu % (smp_cpu_mtid + 1); + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + if (cpumask_test_cpu(cpu, &cpu_setup_mask)) + cpumask_set_cpu(cpu, &mask); + } +out: + cpumask_copy(dst, &mask); +} + +#define TOPOLOGY_CORE_BITS 64 + +static void add_cpus_to_mask(struct topology_core *tl_core, + struct mask_info *drawer, + struct mask_info *book, + struct mask_info *socket) +{ + struct cpu_topology_s390 *topo; + unsigned int core; + + for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { + unsigned int max_cpu, rcore; + int cpu; + + rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; + cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (cpu < 0) + continue; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + topo = &cpu_topology[cpu]; + topo->drawer_id = drawer->id; + topo->book_id = book->id; + topo->socket_id = socket->id; + topo->core_id = rcore; + topo->thread_id = cpu; + topo->dedicated = tl_core->d; + cpumask_set_cpu(cpu, &drawer->mask); + cpumask_set_cpu(cpu, &book->mask); + cpumask_set_cpu(cpu, &socket->mask); + smp_cpu_set_polarization(cpu, tl_core->pp); + } + } +} + +static void clear_masks(void) +{ + struct mask_info *info; + + info = &socket_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } + info = &book_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } + info = &drawer_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } +} + +static union topology_entry *next_tle(union topology_entry *tle) +{ + if (!tle->nl) + return (union topology_entry *)((struct topology_core *)tle + 1); + return (union topology_entry *)((struct topology_container *)tle + 1); +} + +static void tl_to_masks(struct sysinfo_15_1_x *info) +{ + struct mask_info *socket = &socket_info; + struct mask_info *book = &book_info; + struct mask_info *drawer = &drawer_info; + union topology_entry *tle, *end; + + clear_masks(); + tle = info->tle; + end = (union topology_entry *)((unsigned long)info + info->length); + while (tle < end) { + switch (tle->nl) { + case 3: + drawer = drawer->next; + drawer->id = tle->container.id; + break; + case 2: + book = book->next; + book->id = tle->container.id; + break; + case 1: + socket = socket->next; + socket->id = tle->container.id; + break; + case 0: + add_cpus_to_mask(&tle->cpu, drawer, book, socket); + break; + default: + clear_masks(); + return; + } + tle = next_tle(tle); + } +} + +static void topology_update_polarization_simple(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + smp_cpu_set_polarization(cpu, POLARIZATION_HRZ); +} + +static int ptf(unsigned long fc) +{ + int rc; + + asm volatile( + " .insn rre,0xb9a20000,%1,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (rc) + : "d" (fc) : "cc"); + return rc; +} + +int topology_set_cpu_management(int fc) +{ + int cpu, rc; + + if (!MACHINE_HAS_TOPOLOGY) + return -EOPNOTSUPP; + if (fc) + rc = ptf(PTF_VERTICAL); + else + rc = ptf(PTF_HORIZONTAL); + if (rc) + return -EBUSY; + for_each_possible_cpu(cpu) + smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + return rc; +} + +void update_cpu_masks(void) +{ + struct cpu_topology_s390 *topo, *topo_package, *topo_sibling; + int cpu, sibling, pkg_first, smt_first, id; + + for_each_possible_cpu(cpu) { + topo = &cpu_topology[cpu]; + cpu_thread_map(&topo->thread_mask, cpu); + cpu_group_map(&topo->core_mask, &socket_info, cpu); + cpu_group_map(&topo->book_mask, &book_info, cpu); + cpu_group_map(&topo->drawer_mask, &drawer_info, cpu); + topo->booted_cores = 0; + if (topology_mode != TOPOLOGY_MODE_HW) { + id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu; + topo->thread_id = cpu; + topo->core_id = cpu; + topo->socket_id = id; + topo->book_id = id; + topo->drawer_id = id; + } + } + for_each_online_cpu(cpu) { + topo = &cpu_topology[cpu]; + pkg_first = cpumask_first(&topo->core_mask); + topo_package = &cpu_topology[pkg_first]; + if (cpu == pkg_first) { + for_each_cpu(sibling, &topo->core_mask) { + topo_sibling = &cpu_topology[sibling]; + smt_first = cpumask_first(&topo_sibling->thread_mask); + if (sibling == smt_first) + topo_package->booted_cores++; + } + } else { + topo->booted_cores = topo_package->booted_cores; + } + } +} + +void store_topology(struct sysinfo_15_1_x *info) +{ + stsi(info, 15, 1, topology_mnest_limit()); +} + +static void __arch_update_dedicated_flag(void *arg) +{ + if (topology_cpu_dedicated(smp_processor_id())) + set_cpu_flag(CIF_DEDICATED_CPU); + else + clear_cpu_flag(CIF_DEDICATED_CPU); +} + +static int __arch_update_cpu_topology(void) +{ + struct sysinfo_15_1_x *info = tl_info; + int rc = 0; + + mutex_lock(&smp_cpu_state_mutex); + if (MACHINE_HAS_TOPOLOGY) { + rc = 1; + store_topology(info); + tl_to_masks(info); + } + update_cpu_masks(); + if (!MACHINE_HAS_TOPOLOGY) + topology_update_polarization_simple(); + mutex_unlock(&smp_cpu_state_mutex); + return rc; +} + +int arch_update_cpu_topology(void) +{ + struct device *dev; + int cpu, rc; + + rc = __arch_update_cpu_topology(); + on_each_cpu(__arch_update_dedicated_flag, NULL, 0); + for_each_online_cpu(cpu) { + dev = get_cpu_device(cpu); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); + } + return rc; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} + +void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_flush_work(void) +{ + flush_work(&topology_work); +} + +static void topology_timer_fn(struct timer_list *unused) +{ + if (ptf(PTF_CHECK)) + topology_schedule_update(); + set_topology_timer(); +} + +static struct timer_list topology_timer; + +static atomic_t topology_poll = ATOMIC_INIT(0); + +static void set_topology_timer(void) +{ + if (atomic_add_unless(&topology_poll, -1, 0)) + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); + else + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); +} + +void topology_expect_change(void) +{ + if (!MACHINE_HAS_TOPOLOGY) + return; + /* This is racy, but it doesn't matter since it is just a heuristic. + * Worst case is that we poll in a higher frequency for a bit longer. + */ + if (atomic_read(&topology_poll) > 60) + return; + atomic_add(60, &topology_poll); + set_topology_timer(); +} + +static int cpu_management; + +static ssize_t dispatching_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", cpu_management); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} + +static ssize_t dispatching_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int val, rc; + char delim; + + if (sscanf(buf, "%d %c", &val, &delim) != 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + rc = 0; + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == val) + goto out; + rc = topology_set_cpu_management(val); + if (rc) + goto out; + cpu_management = val; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc ? rc : count; +} +static DEVICE_ATTR_RW(dispatching); + +static ssize_t cpu_polarization_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cpu = dev->id; + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + switch (smp_cpu_get_polarization(cpu)) { + case POLARIZATION_HRZ: + count = sprintf(buf, "horizontal\n"); + break; + case POLARIZATION_VL: + count = sprintf(buf, "vertical:low\n"); + break; + case POLARIZATION_VM: + count = sprintf(buf, "vertical:medium\n"); + break; + case POLARIZATION_VH: + count = sprintf(buf, "vertical:high\n"); + break; + default: + count = sprintf(buf, "unknown\n"); + break; + } + mutex_unlock(&smp_cpu_state_mutex); + return count; +} +static DEVICE_ATTR(polarization, 0444, cpu_polarization_show, NULL); + +static struct attribute *topology_cpu_attrs[] = { + &dev_attr_polarization.attr, + NULL, +}; + +static struct attribute_group topology_cpu_attr_group = { + .attrs = topology_cpu_attrs, +}; + +static ssize_t cpu_dedicated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cpu = dev->id; + ssize_t count; + + mutex_lock(&smp_cpu_state_mutex); + count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + mutex_unlock(&smp_cpu_state_mutex); + return count; +} +static DEVICE_ATTR(dedicated, 0444, cpu_dedicated_show, NULL); + +static struct attribute *topology_extra_cpu_attrs[] = { + &dev_attr_dedicated.attr, + NULL, +}; + +static struct attribute_group topology_extra_cpu_attr_group = { + .attrs = topology_extra_cpu_attrs, +}; + +int topology_cpu_init(struct cpu *cpu) +{ + int rc; + + rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); + if (rc || !MACHINE_HAS_TOPOLOGY) + return rc; + rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); + if (rc) + sysfs_remove_group(&cpu->dev.kobj, &topology_cpu_attr_group); + return rc; +} + +static const struct cpumask *cpu_thread_mask(int cpu) +{ + return &cpu_topology[cpu].thread_mask; +} + + +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static const struct cpumask *cpu_drawer_mask(int cpu) +{ + return &cpu_topology[cpu].drawer_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void __init alloc_masks(struct sysinfo_15_1_x *info, + struct mask_info *mask, int offset) +{ + int i, nr_masks; + + nr_masks = info->mag[TOPOLOGY_NR_MAG - offset]; + for (i = 0; i < info->mnest - offset; i++) + nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; + nr_masks = max(nr_masks, 1); + for (i = 0; i < nr_masks; i++) { + mask->next = memblock_alloc(sizeof(*mask->next), 8); + if (!mask->next) + panic("%s: Failed to allocate %zu bytes align=0x%x\n", + __func__, sizeof(*mask->next), 8); + mask = mask->next; + } +} + +void __init topology_init_early(void) +{ + struct sysinfo_15_1_x *info; + + set_sched_topology(s390_topology); + if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { + if (MACHINE_HAS_TOPOLOGY) + topology_mode = TOPOLOGY_MODE_HW; + else + topology_mode = TOPOLOGY_MODE_SINGLE; + } + if (!MACHINE_HAS_TOPOLOGY) + goto out; + tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!tl_info) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + info = tl_info; + store_topology(info); + pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", + info->mag[0], info->mag[1], info->mag[2], info->mag[3], + info->mag[4], info->mag[5], info->mnest); + alloc_masks(info, &socket_info, 1); + alloc_masks(info, &book_info, 2); + alloc_masks(info, &drawer_info, 3); +out: + cpumask_set_cpu(0, &cpu_setup_mask); + __arch_update_cpu_topology(); + __arch_update_dedicated_flag(NULL); +} + +static inline int topology_get_mode(int enabled) +{ + if (!enabled) + return TOPOLOGY_MODE_SINGLE; + return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; +} + +static inline int topology_is_enabled(void) +{ + return topology_mode != TOPOLOGY_MODE_SINGLE; +} + +static int __init topology_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + topology_mode = topology_get_mode(enabled); + return 0; +} +early_param("topology", topology_setup); + +static int topology_ctl_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int enabled = topology_is_enabled(); + int new_mode; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &enabled, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + mutex_lock(&smp_cpu_state_mutex); + new_mode = topology_get_mode(enabled); + if (topology_mode != new_mode) { + topology_mode = new_mode; + topology_schedule_update(); + } + mutex_unlock(&smp_cpu_state_mutex); + topology_flush_work(); + + return rc; +} + +static struct ctl_table topology_ctl_table[] = { + { + .procname = "topology", + .mode = 0644, + .proc_handler = topology_ctl_handler, + }, + { }, +}; + +static int __init topology_init(void) +{ + struct device *dev_root; + int rc = 0; + + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); + if (MACHINE_HAS_TOPOLOGY) + set_topology_timer(); + else + topology_update_polarization_simple(); + register_sysctl("s390", topology_ctl_table); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_dispatching); + put_device(dev_root); + } + return rc; +} +device_initcall(topology_init); diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c new file mode 100644 index 0000000000..11a669f3cc --- /dev/null +++ b/arch/s390/kernel/trace.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tracepoint definitions for s390 + * + * Copyright IBM Corp. 2015 + * Author(s): Martin Schwidefsky + */ + +#include +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(s390_diagnose); + +static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth); + +void notrace trace_s390_diagnose_norecursion(int diag_nr) +{ + unsigned long flags; + unsigned int *depth; + + /* Avoid lockdep recursion. */ + if (IS_ENABLED(CONFIG_LOCKDEP)) + return; + local_irq_save(flags); + depth = this_cpu_ptr(&diagnose_trace_depth); + if (*depth == 0) { + (*depth)++; + trace_s390_diagnose(diag_nr); + (*depth)--; + } + local_irq_restore(flags); +} diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c new file mode 100644 index 0000000000..1d2aa448d1 --- /dev/null +++ b/arch/s390/kernel/traps.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * + * Derived from "arch/i386/kernel/traps.c" + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. + */ +#include "asm/irqflags.h" +#include "asm/ptrace.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "entry.h" + +static inline void __user *get_trap_ip(struct pt_regs *regs) +{ + unsigned long address; + + if (regs->int_code & 0x200) + address = current->thread.trap_tdb.data[3]; + else + address = regs->psw.addr; + return (void __user *) (address - (regs->int_code >> 16)); +} + +int is_valid_bugaddr(unsigned long addr) +{ + return 1; +} + +void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) +{ + if (user_mode(regs)) { + force_sig_fault(si_signo, si_code, get_trap_ip(regs)); + report_user_fault(regs, si_signo, 0); + } else { + if (!fixup_exception(regs)) + die(regs, str); + } +} + +static void do_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) +{ + if (notify_die(DIE_TRAP, str, regs, 0, + regs->int_code, si_signo) == NOTIFY_STOP) + return; + do_report_trap(regs, si_signo, si_code, str); +} +NOKPROBE_SYMBOL(do_trap); + +void do_per_trap(struct pt_regs *regs) +{ + if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP) + return; + if (!current->ptrace) + return; + force_sig_fault(SIGTRAP, TRAP_HWBKPT, + (void __force __user *) current->thread.per_event.address); +} +NOKPROBE_SYMBOL(do_per_trap); + +static void default_trap_handler(struct pt_regs *regs) +{ + if (user_mode(regs)) { + report_user_fault(regs, SIGSEGV, 0); + force_exit_sig(SIGSEGV); + } else + die(regs, "Unknown program exception"); +} + +#define DO_ERROR_INFO(name, signr, sicode, str) \ +static void name(struct pt_regs *regs) \ +{ \ + do_trap(regs, signr, sicode, str); \ +} + +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, + "addressing exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, + "execute exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, + "fixpoint divide exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, + "fixpoint overflow exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, + "HFP overflow exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, + "HFP underflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, + "HFP significance exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, + "HFP divide exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, + "HFP square root exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, + "operand exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, + "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, + "special operation exception") +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, + "transaction constraint exception") + +static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) +{ + int si_code = 0; + /* FPC[2] is Data Exception Code */ + if ((fpc & 0x00000300) == 0) { + /* bits 6 and 7 of DXC are 0 iff IEEE exception */ + if (fpc & 0x8000) /* invalid fp operation */ + si_code = FPE_FLTINV; + else if (fpc & 0x4000) /* div by 0 */ + si_code = FPE_FLTDIV; + else if (fpc & 0x2000) /* overflow */ + si_code = FPE_FLTOVF; + else if (fpc & 0x1000) /* underflow */ + si_code = FPE_FLTUND; + else if (fpc & 0x0800) /* inexact */ + si_code = FPE_FLTRES; + } + do_trap(regs, SIGFPE, si_code, "floating point exception"); +} + +static void translation_specification_exception(struct pt_regs *regs) +{ + /* May never happen. */ + panic("Translation-Specification Exception"); +} + +static void illegal_op(struct pt_regs *regs) +{ + __u8 opcode[6]; + __u16 __user *location; + int is_uprobe_insn = 0; + int signal = 0; + + location = get_trap_ip(regs); + + if (user_mode(regs)) { + if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) + return; + if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { + if (current->ptrace) + force_sig_fault(SIGTRAP, TRAP_BRKPT, location); + else + signal = SIGILL; +#ifdef CONFIG_UPROBES + } else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) { + is_uprobe_insn = 1; +#endif + } else + signal = SIGILL; + } + /* + * We got either an illegal op in kernel mode, or user space trapped + * on a uprobes illegal instruction. See if kprobes or uprobes picks + * it up. If not, SIGILL. + */ + if (is_uprobe_insn || !user_mode(regs)) { + if (notify_die(DIE_BPT, "bpt", regs, 0, + 3, SIGTRAP) != NOTIFY_STOP) + signal = SIGILL; + } + if (signal) + do_trap(regs, signal, ILL_ILLOPC, "illegal operation"); +} +NOKPROBE_SYMBOL(illegal_op); + +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, + "specification exception"); + +static void vector_exception(struct pt_regs *regs) +{ + int si_code, vic; + + if (!MACHINE_HAS_VX) { + do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); + return; + } + + /* get vector interrupt code from fpc */ + save_fpu_regs(); + vic = (current->thread.fpu.fpc & 0xf00) >> 8; + switch (vic) { + case 1: /* invalid vector operation */ + si_code = FPE_FLTINV; + break; + case 2: /* division by zero */ + si_code = FPE_FLTDIV; + break; + case 3: /* overflow */ + si_code = FPE_FLTOVF; + break; + case 4: /* underflow */ + si_code = FPE_FLTUND; + break; + case 5: /* inexact */ + si_code = FPE_FLTRES; + break; + default: /* unknown cause */ + si_code = 0; + } + do_trap(regs, SIGFPE, si_code, "vector exception"); +} + +static void data_exception(struct pt_regs *regs) +{ + save_fpu_regs(); + if (current->thread.fpu.fpc & FPC_DXC_MASK) + do_fp_trap(regs, current->thread.fpu.fpc); + else + do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); +} + +static void space_switch_exception(struct pt_regs *regs) +{ + /* Set user psw back to home space mode. */ + if (user_mode(regs)) + regs->psw.mask |= PSW_ASC_HOME; + /* Send SIGILL. */ + do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event"); +} + +static void monitor_event_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) + return; + + switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { + case BUG_TRAP_TYPE_NONE: + fixup_exception(regs); + break; + case BUG_TRAP_TYPE_WARN: + break; + case BUG_TRAP_TYPE_BUG: + die(regs, "monitor event"); + break; + } +} + +void kernel_stack_overflow(struct pt_regs *regs) +{ + bust_spinlocks(1); + printk("Kernel stack overflow.\n"); + show_regs(regs); + bust_spinlocks(0); + panic("Corrupt kernel stack, can't continue."); +} +NOKPROBE_SYMBOL(kernel_stack_overflow); + +static void __init test_monitor_call(void) +{ + int val = 1; + + if (!IS_ENABLED(CONFIG_BUG)) + return; + asm volatile( + " mc 0,0\n" + "0: xgr %0,%0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (val)); + if (!val) + panic("Monitor call doesn't work!\n"); +} + +void __init trap_init(void) +{ + local_mcck_enable(); + test_monitor_call(); +} + +static void (*pgm_check_table[128])(struct pt_regs *regs); + +void noinstr __do_pgm_check(struct pt_regs *regs) +{ + unsigned int trapnr; + irqentry_state_t state; + + regs->int_code = S390_lowcore.pgm_int_code; + regs->int_parm_long = S390_lowcore.trans_exc_code; + + state = irqentry_enter(regs); + + if (user_mode(regs)) { + update_timer_sys(); + if (!static_branch_likely(&cpu_has_bear)) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; + } + + if (S390_lowcore.pgm_code & 0x0200) { + /* transaction abort */ + current->thread.trap_tdb = S390_lowcore.pgm_tdb; + } + + if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (user_mode(regs)) { + struct per_event *ev = ¤t->thread.per_event; + + set_thread_flag(TIF_PER_TRAP); + ev->address = S390_lowcore.per_address; + ev->cause = S390_lowcore.per_code_combined; + ev->paid = S390_lowcore.per_access_id; + } else { + /* PER event in kernel is kprobes */ + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + do_per_trap(regs); + goto out; + } + } + + if (!irqs_disabled_flags(regs->psw.mask)) + trace_hardirqs_on(); + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + + trapnr = regs->int_code & PGM_INT_CODE_MASK; + if (trapnr) + pgm_check_table[trapnr](regs); +out: + local_irq_disable(); + irqentry_exit(regs, state); +} + +/* + * The program check table contains exactly 128 (0x00-0x7f) entries. Each + * line defines the function to be called corresponding to the program check + * interruption code. + */ +static void (*pgm_check_table[128])(struct pt_regs *regs) = { + [0x00] = default_trap_handler, + [0x01] = illegal_op, + [0x02] = privileged_op, + [0x03] = execute_exception, + [0x04] = do_protection_exception, + [0x05] = addressing_exception, + [0x06] = specification_exception, + [0x07] = data_exception, + [0x08] = overflow_exception, + [0x09] = divide_exception, + [0x0a] = overflow_exception, + [0x0b] = divide_exception, + [0x0c] = hfp_overflow_exception, + [0x0d] = hfp_underflow_exception, + [0x0e] = hfp_significance_exception, + [0x0f] = hfp_divide_exception, + [0x10] = do_dat_exception, + [0x11] = do_dat_exception, + [0x12] = translation_specification_exception, + [0x13] = special_op_exception, + [0x14] = default_trap_handler, + [0x15] = operand_exception, + [0x16] = default_trap_handler, + [0x17] = default_trap_handler, + [0x18] = transaction_exception, + [0x19] = default_trap_handler, + [0x1a] = default_trap_handler, + [0x1b] = vector_exception, + [0x1c] = space_switch_exception, + [0x1d] = hfp_sqrt_exception, + [0x1e ... 0x37] = default_trap_handler, + [0x38] = do_dat_exception, + [0x39] = do_dat_exception, + [0x3a] = do_dat_exception, + [0x3b] = do_dat_exception, + [0x3c] = default_trap_handler, + [0x3d] = do_secure_storage_access, + [0x3e] = do_non_secure_storage_access, + [0x3f] = do_secure_storage_violation, + [0x40] = monitor_event_exception, + [0x41 ... 0x7f] = default_trap_handler, +}; + +#define COND_TRAP(x) asm( \ + ".weak " __stringify(x) "\n\t" \ + ".set " __stringify(x) "," \ + __stringify(default_trap_handler)) + +COND_TRAP(do_secure_storage_access); +COND_TRAP(do_non_secure_storage_access); +COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c new file mode 100644 index 0000000000..0ece156fdd --- /dev/null +++ b/arch/s390/kernel/unwind_bc.c @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool outside_of_stack(struct unwind_state *state, unsigned long sp) +{ + return (sp <= state->sp) || + (sp > state->stack_info.end - sizeof(struct stack_frame)); +} + +static bool update_stack_info(struct unwind_state *state, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + + /* New stack pointer leaves the current stack */ + if (get_stack_info(sp, state->task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) + /* 'sp' does not point to a valid stack */ + return false; + return true; +} + +static inline bool is_final_pt_regs(struct unwind_state *state, + struct pt_regs *regs) +{ + /* user mode or kernel thread pt_regs at the bottom of task stack */ + if (task_pt_regs(state->task) == regs) + return true; + + /* user mode pt_regs at the bottom of irq stack */ + return state->stack_info.type == STACK_TYPE_IRQ && + state->stack_info.end - sizeof(struct pt_regs) == (unsigned long)regs && + READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + struct stack_frame *sf; + struct pt_regs *regs; + unsigned long sp, ip; + bool reliable; + + regs = state->regs; + if (unlikely(regs)) { + sp = state->sp; + sf = (struct stack_frame *) sp; + ip = READ_ONCE_NOCHECK(sf->gprs[8]); + reliable = false; + regs = NULL; + /* skip bogus %r14 or if is the same as regs->psw.addr */ + if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) { + state->regs = NULL; + return unwind_next_frame(state); + } + } else { + sf = (struct stack_frame *) state->sp; + sp = READ_ONCE_NOCHECK(sf->back_chain); + if (likely(sp)) { + /* Non-zero back-chain points to the previous frame */ + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_NOCHECK(sf->gprs[8]); + reliable = true; + } else { + /* No back-chain, look for a pt_regs structure */ + sp = state->sp + STACK_FRAME_OVERHEAD; + if (!on_stack(info, sp, sizeof(struct pt_regs))) + goto out_err; + regs = (struct pt_regs *) sp; + if (is_final_pt_regs(state, regs)) + goto out_stop; + ip = READ_ONCE_NOCHECK(regs->psw.addr); + sp = READ_ONCE_NOCHECK(regs->gprs[15]); + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + reliable = true; + } + } + + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (sp & 0x7) + goto out_err; + + /* Update unwind state */ + state->sp = sp; + state->regs = regs; + state->reliable = reliable; + state->ip = unwind_recover_ret_addr(state, ip); + return true; + +out_err: + state->error = true; +out_stop: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long first_frame) +{ + struct stack_info *info = &state->stack_info; + struct stack_frame *sf; + unsigned long ip, sp; + + memset(state, 0, sizeof(*state)); + state->task = task; + state->regs = regs; + + /* Don't even attempt to start from user mode regs: */ + if (regs && user_mode(regs)) { + info->type = STACK_TYPE_UNKNOWN; + return; + } + + /* Get the instruction pointer from pt_regs or the stack frame */ + if (regs) { + ip = regs->psw.addr; + sp = regs->gprs[15]; + } else if (task == current) { + sp = current_frame_address(); + } else { + sp = task->thread.ksp; + } + + /* Get current stack pointer and initialize stack info */ + if (!update_stack_info(state, sp)) { + /* Something is wrong with the stack pointer */ + info->type = STACK_TYPE_UNKNOWN; + state->error = true; + return; + } + + if (!regs) { + /* Stack frame is within valid stack */ + sf = (struct stack_frame *)sp; + ip = READ_ONCE_NOCHECK(sf->gprs[8]); + } + + /* Update unwind state */ + state->sp = sp; + state->reliable = true; + state->ip = unwind_recover_ret_addr(state, ip); + + if (!first_frame) + return; + /* Skip through the call chain to the specified starting frame */ + while (!unwind_done(state)) { + if (on_stack(&state->stack_info, first_frame, sizeof(struct stack_frame))) { + if (state->sp >= first_frame) + break; + } + unwind_next_frame(state); + } +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c new file mode 100644 index 0000000000..b88345ef8b --- /dev/null +++ b/arch/s390/kernel/uprobes.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * User-space Probes (UProbes) for s390 + * + * Copyright IBM Corp. 2014 + * Author(s): Jan Willeke, + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "entry.h" + +#define UPROBE_TRAP_NR UINT_MAX + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + return probe_is_prohibited_opcode(auprobe->insn); +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) + return -EINVAL; + if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) + return -EINVAL; + clear_thread_flag(TIF_PER_TRAP); + auprobe->saved_per = psw_bits(regs->psw).per; + auprobe->saved_int_code = regs->int_code; + regs->int_code = UPROBE_TRAP_NR; + regs->psw.addr = current->utask->xol_vaddr; + set_tsk_thread_flag(current, TIF_UPROBE_SINGLESTEP); + update_cr_regs(current); + return 0; +} + +bool arch_uprobe_xol_was_trapped(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs->int_code != UPROBE_TRAP_NR) + return true; + return false; +} + +static int check_per_event(unsigned short cause, unsigned long control, + struct pt_regs *regs) +{ + if (!(regs->psw.mask & PSW_MASK_PER)) + return 0; + /* user space single step */ + if (control == 0) + return 1; + /* over indication for storage alteration */ + if ((control & 0x20200000) && (cause & 0x2000)) + return 1; + if (cause & 0x8000) { + /* all branches */ + if ((control & 0x80800000) == 0x80000000) + return 1; + /* branch into selected range */ + if (((control & 0x80800000) == 0x80800000) && + regs->psw.addr >= current->thread.per_user.start && + regs->psw.addr <= current->thread.per_user.end) + return 1; + } + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int fixup = probe_get_fixup_type(auprobe->insn); + struct uprobe_task *utask = current->utask; + + clear_tsk_thread_flag(current, TIF_UPROBE_SINGLESTEP); + update_cr_regs(current); + psw_bits(regs->psw).per = auprobe->saved_per; + regs->int_code = auprobe->saved_int_code; + + if (fixup & FIXUP_PSW_NORMAL) + regs->psw.addr += utask->vaddr - utask->xol_vaddr; + if (fixup & FIXUP_RETURN_REGISTER) { + int reg = (auprobe->insn[0] & 0xf0) >> 4; + + regs->gprs[reg] += utask->vaddr - utask->xol_vaddr; + } + if (fixup & FIXUP_BRANCH_NOT_TAKEN) { + int ilen = insn_length(auprobe->insn[0] >> 8); + + if (regs->psw.addr - utask->xol_vaddr == ilen) + regs->psw.addr = utask->vaddr + ilen; + } + if (check_per_event(current->thread.per_event.cause, + current->thread.per_user.control, regs)) { + /* fix per address */ + current->thread.per_event.address = utask->vaddr; + /* trigger per event */ + set_thread_flag(TIF_PER_TRAP); + } + return 0; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!user_mode(regs)) + return NOTIFY_DONE; + if (regs->int_code & 0x200) /* Trap during transaction */ + return NOTIFY_DONE; + switch (val) { + case DIE_BPT: + if (uprobe_pre_sstep_notifier(regs)) + return NOTIFY_STOP; + break; + case DIE_SSTEP: + if (uprobe_post_sstep_notifier(regs)) + return NOTIFY_STOP; + break; + default: + break; + } + return NOTIFY_DONE; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + clear_thread_flag(TIF_UPROBE_SINGLESTEP); + regs->int_code = auprobe->saved_int_code; + regs->psw.addr = current->utask->vaddr; + current->thread.per_event.address = current->utask->vaddr; +} + +unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline, + struct pt_regs *regs) +{ + unsigned long orig; + + orig = regs->gprs[14]; + regs->gprs[14] = trampoline; + return orig; +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return user_stack_pointer(regs) <= ret->stack; + else + return user_stack_pointer(regs) < ret->stack; +} + +/* Instruction Emulation */ + +static void adjust_psw_addr(psw_t *psw, unsigned long len) +{ + psw->addr = __rewind_psw(*psw, -len); +} + +#define EMU_ILLEGAL_OP 1 +#define EMU_SPECIFICATION 2 +#define EMU_ADDRESSING 3 + +#define emu_load_ril(ptr, output) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(*(ptr)) input; \ + int __rc = 0; \ + \ + if ((u64 __force)ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (get_user(input, ptr)) \ + __rc = EMU_ADDRESSING; \ + else \ + *(output) = input; \ + __rc; \ +}) + +#define emu_store_ril(regs, ptr, input) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(ptr) __ptr = (ptr); \ + int __rc = 0; \ + \ + if ((u64 __force)__ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (put_user(*(input), __ptr)) \ + __rc = EMU_ADDRESSING; \ + if (__rc == 0) \ + sim_stor_event(regs, \ + (void __force *)__ptr, \ + mask + 1); \ + __rc; \ +}) + +#define emu_cmp_ril(regs, ptr, cmp) \ +({ \ + unsigned int mask = sizeof(*(ptr)) - 1; \ + __typeof__(*(ptr)) input; \ + int __rc = 0; \ + \ + if ((u64 __force)ptr & mask) \ + __rc = EMU_SPECIFICATION; \ + else if (get_user(input, ptr)) \ + __rc = EMU_ADDRESSING; \ + else if (input > *(cmp)) \ + psw_bits((regs)->psw).cc = 1; \ + else if (input < *(cmp)) \ + psw_bits((regs)->psw).cc = 2; \ + else \ + psw_bits((regs)->psw).cc = 0; \ + __rc; \ +}) + +struct insn_ril { + u8 opc0; + u8 reg : 4; + u8 opc1 : 4; + s32 disp; +} __packed; + +union split_register { + u64 u64; + u32 u32[2]; + u16 u16[4]; + s64 s64; + s32 s32[2]; + s16 s16[4]; +}; + +/* + * If user per registers are setup to trace storage alterations and an + * emulated store took place on a fitting address a user trap is generated. + */ +static void sim_stor_event(struct pt_regs *regs, void *addr, int len) +{ + if (!(regs->psw.mask & PSW_MASK_PER)) + return; + if (!(current->thread.per_user.control & PER_EVENT_STORE)) + return; + if ((void *)current->thread.per_user.start > (addr + len)) + return; + if ((void *)current->thread.per_user.end < addr) + return; + current->thread.per_event.address = regs->psw.addr; + current->thread.per_event.cause = PER_EVENT_STORE >> 16; + set_thread_flag(TIF_PER_TRAP); +} + +/* + * pc relative instructions are emulated, since parameters may not be + * accessible from the xol area due to range limitations. + */ +static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + union split_register *rx; + struct insn_ril *insn; + unsigned int ilen; + void *uptr; + int rc = 0; + + insn = (struct insn_ril *) &auprobe->insn; + rx = (union split_register *) ®s->gprs[insn->reg]; + uptr = (void *)(regs->psw.addr + (insn->disp * 2)); + ilen = insn_length(insn->opc0); + + switch (insn->opc0) { + case 0xc0: + switch (insn->opc1) { + case 0x00: /* larl */ + rx->u64 = (unsigned long)uptr; + break; + } + break; + case 0xc4: + switch (insn->opc1) { + case 0x02: /* llhrl */ + rc = emu_load_ril((u16 __user *)uptr, &rx->u32[1]); + break; + case 0x04: /* lghrl */ + rc = emu_load_ril((s16 __user *)uptr, &rx->u64); + break; + case 0x05: /* lhrl */ + rc = emu_load_ril((s16 __user *)uptr, &rx->u32[1]); + break; + case 0x06: /* llghrl */ + rc = emu_load_ril((u16 __user *)uptr, &rx->u64); + break; + case 0x08: /* lgrl */ + rc = emu_load_ril((u64 __user *)uptr, &rx->u64); + break; + case 0x0c: /* lgfrl */ + rc = emu_load_ril((s32 __user *)uptr, &rx->u64); + break; + case 0x0d: /* lrl */ + rc = emu_load_ril((u32 __user *)uptr, &rx->u32[1]); + break; + case 0x0e: /* llgfrl */ + rc = emu_load_ril((u32 __user *)uptr, &rx->u64); + break; + case 0x07: /* sthrl */ + rc = emu_store_ril(regs, (u16 __user *)uptr, &rx->u16[3]); + break; + case 0x0b: /* stgrl */ + rc = emu_store_ril(regs, (u64 __user *)uptr, &rx->u64); + break; + case 0x0f: /* strl */ + rc = emu_store_ril(regs, (u32 __user *)uptr, &rx->u32[1]); + break; + } + break; + case 0xc6: + switch (insn->opc1) { + case 0x04: /* cghrl */ + rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); + break; + case 0x05: /* chrl */ + rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s32[1]); + break; + case 0x06: /* clghrl */ + rc = emu_cmp_ril(regs, (u16 __user *)uptr, &rx->u64); + break; + case 0x07: /* clhrl */ + rc = emu_cmp_ril(regs, (u16 __user *)uptr, &rx->u32[1]); + break; + case 0x08: /* cgrl */ + rc = emu_cmp_ril(regs, (s64 __user *)uptr, &rx->s64); + break; + case 0x0a: /* clgrl */ + rc = emu_cmp_ril(regs, (u64 __user *)uptr, &rx->u64); + break; + case 0x0c: /* cgfrl */ + rc = emu_cmp_ril(regs, (s32 __user *)uptr, &rx->s64); + break; + case 0x0d: /* crl */ + rc = emu_cmp_ril(regs, (s32 __user *)uptr, &rx->s32[1]); + break; + case 0x0e: /* clgfrl */ + rc = emu_cmp_ril(regs, (u32 __user *)uptr, &rx->u64); + break; + case 0x0f: /* clrl */ + rc = emu_cmp_ril(regs, (u32 __user *)uptr, &rx->u32[1]); + break; + } + break; + } + adjust_psw_addr(®s->psw, ilen); + switch (rc) { + case EMU_ILLEGAL_OP: + regs->int_code = ilen << 16 | 0x0001; + do_report_trap(regs, SIGILL, ILL_ILLOPC, NULL); + break; + case EMU_SPECIFICATION: + regs->int_code = ilen << 16 | 0x0006; + do_report_trap(regs, SIGILL, ILL_ILLOPC , NULL); + break; + case EMU_ADDRESSING: + regs->int_code = ilen << 16 | 0x0005; + do_report_trap(regs, SIGSEGV, SEGV_MAPERR, NULL); + break; + } +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || + ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && + !is_compat_task())) { + regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); + do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); + return true; + } + if (probe_is_insn_relative_long(auprobe->insn)) { + handle_insn_ril(auprobe, regs); + return true; + } + return false; +} diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c new file mode 100644 index 0000000000..fc07bc39e6 --- /dev/null +++ b/arch/s390/kernel/uv.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Ultravisor functions and initialization + * + * Copyright IBM Corp. 2019, 2020 + */ +#define KMSG_COMPONENT "prot_virt" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + +/* + * uv_info contains both host and guest information but it's currently only + * expected to be used within modules if it's the KVM module or for + * any PV guest module. + * + * The kernel itself will write these values once in uv_query_info() + * and then make some of them readable via a sysfs interface. + */ +struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); + +#if IS_ENABLED(CONFIG_KVM) +int __bootdata_preserved(prot_virt_host); +EXPORT_SYMBOL(prot_virt_host); + +static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len) +{ + struct uv_cb_init uvcb = { + .header.cmd = UVC_CMD_INIT_UV, + .header.len = sizeof(uvcb), + .stor_origin = stor_base, + .stor_len = stor_len, + }; + + if (uv_call(0, (uint64_t)&uvcb)) { + pr_err("Ultravisor init failed with rc: 0x%x rrc: 0%x\n", + uvcb.header.rc, uvcb.header.rrc); + return -1; + } + return 0; +} + +void __init setup_uv(void) +{ + void *uv_stor_base; + + if (!is_prot_virt_host()) + return; + + uv_stor_base = memblock_alloc_try_nid( + uv_info.uv_base_stor_len, SZ_1M, SZ_2G, + MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); + if (!uv_stor_base) { + pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n", + uv_info.uv_base_stor_len); + goto fail; + } + + if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) { + memblock_free(uv_stor_base, uv_info.uv_base_stor_len); + goto fail; + } + + pr_info("Reserving %luMB as ultravisor base storage\n", + uv_info.uv_base_stor_len >> 20); + return; +fail: + pr_info("Disabling support for protected virtualization"); + prot_virt_host = 0; +} + +/* + * Requests the Ultravisor to pin the page in the shared state. This will + * cause an intercept when the guest attempts to unshare the pinned page. + */ +int uv_pin_shared(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_PIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .paddr = paddr, + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(uv_pin_shared); + +/* + * Requests the Ultravisor to destroy a guest page and make it + * accessible to the host. The destroy clears the page instead of + * exporting. + * + * @paddr: Absolute host address of page to be destroyed + */ +static int uv_destroy_page(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_DESTR_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) { + /* + * Older firmware uses 107/d as an indication of a non secure + * page. Let us emulate the newer variant (no-op). + */ + if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd) + return 0; + return -EINVAL; + } + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_destroy_owned_page(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_destroy_page(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Requests the Ultravisor to encrypt a guest page and make it + * accessible to the host for paging (export). + * + * @paddr: Absolute host address of page to be exported + */ +int uv_convert_from_secure(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * The caller must already hold a reference to the page + */ +int uv_convert_owned_from_secure(unsigned long paddr) +{ + struct page *page = phys_to_page(paddr); + int rc; + + get_page(page); + rc = uv_convert_from_secure(paddr); + if (!rc) + clear_bit(PG_arch_1, &page->flags); + put_page(page); + return rc; +} + +/* + * Calculate the expected ref_count for a page that would otherwise have no + * further pins. This was cribbed from similar functions in other places in + * the kernel, but with some slight modifications. We know that a secure + * page can not be a huge page for example. + */ +static int expected_page_refs(struct page *page) +{ + int res; + + res = page_mapcount(page); + if (PageSwapCache(page)) { + res++; + } else if (page_mapping(page)) { + res++; + if (page_has_private(page)) + res++; + } + return res; +} + +static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) +{ + int expected, cc = 0; + + if (PageWriteback(page)) + return -EAGAIN; + expected = expected_page_refs(page); + if (!page_ref_freeze(page, expected)) + return -EBUSY; + set_bit(PG_arch_1, &page->flags); + /* + * If the UVC does not succeed or fail immediately, we don't want to + * loop for long, or we might get stall notifications. + * On the other hand, this is a complex scenario and we are holding a lot of + * locks, so we can't easily sleep and reschedule. We try only once, + * and if the UVC returned busy or partial completion, we return + * -EAGAIN and we let the callers deal with it. + */ + cc = __uv_call(0, (u64)uvcb); + page_ref_unfreeze(page, expected); + /* + * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * If busy or partially completed, return -EAGAIN. + */ + if (cc == UVC_CC_OK) + return 0; + else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL) + return -EAGAIN; + return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; +} + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +/* + * Requests the Ultravisor to make a page accessible to a guest. + * If it's brought in the first time, it will be cleared. If + * it has been exported before, it will be decrypted and integrity + * checked. + */ +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +{ + struct vm_area_struct *vma; + bool local_drain = false; + spinlock_t *ptelock; + unsigned long uaddr; + struct page *page; + pte_t *ptep; + int rc; + +again: + rc = -EFAULT; + mmap_read_lock(gmap->mm); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = vma_lookup(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. If + * userspace is playing dirty tricky with mapping huge pages later + * on this will result in a segmentation fault. + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = -ENXIO; + ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + if (!ptep) + goto out; + if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { + page = pte_page(*ptep); + rc = -EAGAIN; + if (trylock_page(page)) { + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(page_to_phys(page)); + rc = make_page_secure(page, uvcb); + unlock_page(page); + } + } + pte_unmap_unlock(ptep, ptelock); +out: + mmap_read_unlock(gmap->mm); + + if (rc == -EAGAIN) { + /* + * If we are here because the UVC returned busy or partial + * completion, this is just a useless check, but it is safe. + */ + wait_on_page_writeback(page); + } else if (rc == -EBUSY) { + /* + * If we have tried a local drain and the page refcount + * still does not match our expected safe value, try with a + * system wide drain. This is needed if the pagevecs holding + * the page are on a different CPU. + */ + if (local_drain) { + lru_add_drain_all(); + /* We give up here, and let the caller try again */ + return -EAGAIN; + } + /* + * We are here if the page refcount does not match the + * expected safe value. The main culprits are usually + * pagevecs. With lru_add_drain() we drain the pagevecs + * on the local CPU so that hopefully the refcount will + * reach the expected safe value. + */ + lru_add_drain(); + local_drain = true; + /* And now we try again immediately after draining */ + goto again; + } else if (rc == -ENXIO) { + if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) + return -EFAULT; + return -EAGAIN; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_make_secure); + +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = gmap->guest_handle, + .gaddr = gaddr, + }; + + return gmap_make_secure(gmap, gaddr, &uvcb); +} +EXPORT_SYMBOL_GPL(gmap_convert_to_secure); + +/** + * gmap_destroy_page - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct vm_area_struct *vma; + unsigned long uaddr; + struct page *page; + int rc; + + rc = -EFAULT; + mmap_read_lock(gmap->mm); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = vma_lookup(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Huge pages should not be able to become secure + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = 0; + /* we take an extra reference here */ + page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + rc = uv_destroy_owned_page(page_to_phys(page)); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_owned_from_secure(page_to_phys(page)); + put_page(page); +out: + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_destroy_page); + +/* + * To be called with the page locked or with an extra reference! This will + * prevent gmap_make_secure from touching the page concurrently. Having 2 + * parallel make_page_accessible is fine, as the UV calls will become a + * no-op if the page is already exported. + */ +int arch_make_page_accessible(struct page *page) +{ + int rc = 0; + + /* Hugepage cannot be protected, so nothing to do */ + if (PageHuge(page)) + return 0; + + /* + * PG_arch_1 is used in 3 places: + * 1. for kernel page tables during early boot + * 2. for storage keys of huge pages and KVM + * 3. As an indication that this page might be secure. This can + * overindicate, e.g. we set the bit before calling + * convert_to_secure. + * As secure pages are never huge, all 3 variants can co-exists. + */ + if (!test_bit(PG_arch_1, &page->flags)) + return 0; + + rc = uv_pin_shared(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + rc = uv_convert_from_secure(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + return rc; +} +EXPORT_SYMBOL_GPL(arch_make_page_accessible); + +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +static ssize_t uv_query_facilities(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); +} + +static struct kobj_attribute uv_query_facilities_attr = + __ATTR(facilities, 0444, uv_query_facilities, NULL); + +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + +static ssize_t uv_query_feature_indications(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications); +} + +static struct kobj_attribute uv_query_feature_indications_attr = + __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); + +static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); +} + +static struct kobj_attribute uv_query_max_guest_cpus_attr = + __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); + +static ssize_t uv_query_max_guest_vms(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); +} + +static struct kobj_attribute uv_query_max_guest_vms_attr = + __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); + +static ssize_t uv_query_max_guest_addr(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); +} + +static struct kobj_attribute uv_query_max_guest_addr_attr = + __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); + +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + +static struct attribute *uv_query_attrs[] = { + &uv_query_facilities_attr.attr, + &uv_query_feature_indications_attr.attr, + &uv_query_max_guest_cpus_attr.attr, + &uv_query_max_guest_vms_attr.attr, + &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, + NULL, +}; + +static struct attribute_group uv_query_attr_group = { + .attrs = uv_query_attrs, +}; + +static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST + val = prot_virt_guest; +#endif + return sysfs_emit(buf, "%d\n", val); +} + +static ssize_t uv_is_prot_virt_host(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int val = 0; + +#if IS_ENABLED(CONFIG_KVM) + val = prot_virt_host; +#endif + + return sysfs_emit(buf, "%d\n", val); +} + +static struct kobj_attribute uv_prot_virt_guest = + __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL); + +static struct kobj_attribute uv_prot_virt_host = + __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL); + +static const struct attribute *uv_prot_virt_attrs[] = { + &uv_prot_virt_guest.attr, + &uv_prot_virt_host.attr, + NULL, +}; + +static struct kset *uv_query_kset; +static struct kobject *uv_kobj; + +static int __init uv_info_init(void) +{ + int rc = -ENOMEM; + + if (!test_facility(158)) + return 0; + + uv_kobj = kobject_create_and_add("uv", firmware_kobj); + if (!uv_kobj) + return -ENOMEM; + + rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs); + if (rc) + goto out_kobj; + + uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); + if (!uv_query_kset) { + rc = -ENOMEM; + goto out_ind_files; + } + + rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); + if (!rc) + return 0; + + kset_unregister(uv_query_kset); +out_ind_files: + sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); +out_kobj: + kobject_del(uv_kobj); + kobject_put(uv_kobj); + return rc; +} +device_initcall(uv_info_init); +#endif diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c new file mode 100644 index 0000000000..bbaefd84f1 --- /dev/null +++ b/arch/s390/kernel/vdso.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vdso setup for s390 + * + * Copyright IBM Corp. 2008 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char vdso64_start[], vdso64_end[]; +extern char vdso32_start[], vdso32_end[]; + +static struct vm_special_mapping vvar_mapping; + +static union { + struct vdso_data data[CS_BASES]; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; + +struct vdso_data *vdso_data = vdso_data_store.data; + +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + +/* + * The VVAR page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (!vma_is_special_mapping(vma, &vvar_mapping)) + continue; + zap_vma_pages(vma); + break; + } + mmap_read_unlock(mm); + return 0; +} +#endif + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + pfn = virt_to_pfn(vdso_data); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. + */ + addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *vma) +{ + current->mm->context.vdso_base = vma->vm_start; + return 0; +} + +static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +int vdso_getcpu_init(void) +{ + set_tod_programmable_field(smp_processor_id()); + return 0; +} +early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ + +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) +{ + unsigned long vvar_start, vdso_text_start, vdso_text_len; + struct vm_special_mapping *vdso_mapping; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); + rc = vvar_start; + if (IS_ERR_VALUE(vvar_start)) + goto out; + vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_mapping); + rc = PTR_ERR(vma); + if (IS_ERR(vma)) + goto out; + vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + /* VM_MAYWRITE for COW so gdb can set breakpoints */ + vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso_mapping); + if (IS_ERR(vma)) { + do_munmap(mm, vvar_start, PAGE_SIZE, NULL); + rc = PTR_ERR(vma); + } else { + current->mm->context.vdso_base = vdso_text_start; + rc = 0; + } +out: + mmap_write_unlock(mm); + return rc; +} + +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; +} + +unsigned long vdso_size(void) +{ + unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; + + if (is_compat_task()) + size += vdso32_end - vdso32_start; + else + size += vdso64_end - vdso64_start; + return PAGE_ALIGN(size); +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); + + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} + +static struct page ** __init vdso_setup_pages(void *start, void *end) +{ + int pages = (end - start) >> PAGE_SHIFT; + struct page **pagelist; + int i; + + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); + return 0; +} +arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 0000000000..5167384843 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 0000000000..23e868b79a --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +targets += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh new file mode 100755 index 0000000000..9c4f951e22 --- /dev/null +++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 0000000000..db19d0680a --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 0000000000..edf5ff1deb --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include +#include + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 0000000000..de2fb93047 --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 0000000000..2e645003fd --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + __ALIGN +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore new file mode 100644 index 0000000000..4ec80685fe --- /dev/null +++ b/arch/s390/kernel/vdso64/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile new file mode 100644 index 0000000000..fc1c6ff817 --- /dev/null +++ b/arch/s390/kernel/vdso64/Makefile @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile +obj-vdso64 = vdso_user_wrapper.o note.o +obj-cvdso64 = vdso64_generic.o getcpu.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) + +# Build rules + +targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg +obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_64 += -m64 + +KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin +ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ + --hash-style=both --build-id=sha1 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) + +obj-y += vdso64_wrapper.o +targets += vdso64.lds +CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +# link rule for the .so file, .lds has to be first +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso64): %.o: %.S FORCE + $(call if_changed_dep,vdso64as) + +$(obj-cvdso64): %.o: %.c FORCE + $(call if_changed_dep,vdso64cc) + +# actual build commands +quiet_cmd_vdso64as = VDSO64A $@ + cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso64cc = VDSO64C $@ + cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso64.so: $(obj)/vdso64.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso64.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 0000000000..37f05cb38d --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c new file mode 100644 index 0000000000..5c5d4a848b --- /dev/null +++ b/arch/s390/kernel/vdso64/getcpu.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright IBM Corp. 2020 */ + +#include +#include +#include +#include "vdso.h" + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + union tod_clock clk; + + /* CPU number is stored in the programmable field of the TOD clock */ + store_tod_clock_ext(&clk); + if (cpu) + *cpu = clk.pf; + /* NUMA node is always zero */ + if (node) + *node = 0; + return 0; +} diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S new file mode 100644 index 0000000000..db19d0680a --- /dev/null +++ b/arch/s390/kernel/vdso64/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h new file mode 100644 index 0000000000..34c7a2312f --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H +#define __ARCH_S390_KERNEL_VDSO64_VDSO_H + +#include + +struct getcpu_cache; + +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); + +#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S new file mode 100644 index 0000000000..4461ea151e --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include +#include + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_getcpu; + __kernel_restart_syscall; + __kernel_rt_sigreturn; + __kernel_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c new file mode 100644 index 0000000000..a9aa75643c --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_generic.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../../../lib/vdso/gettimeofday.c" +#include "vdso.h" + +int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_getres(clock, ts); +} diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S new file mode 100644 index 0000000000..6721849986 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso64_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + + __PAGE_ALIGNED_DATA + + .globl vdso64_start, vdso64_end + .balign PAGE_SIZE +vdso64_start: + .incbin "arch/s390/kernel/vdso64/vdso64.so" + .balign PAGE_SIZE +vdso64_end: + + .previous diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S new file mode 100644 index 0000000000..57f62596e5 --- /dev/null +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include + +#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) + +/* + * Older glibc version called vdso without allocating a stackframe. This wrapper + * is just used to allocate a stackframe. See + * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e + * for details. + */ +.macro vdso_func func + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + aghi %r15,-WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) + CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD + stg %r14,STACK_FRAME_OVERHEAD(%r15) + brasl %r14,__s390_vdso_\func + lg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,WRAPPER_FRAME_SIZE + CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + CFI_RESTORE 15 + br %r14 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_func gettimeofday +vdso_func clock_getres +vdso_func clock_gettime +vdso_func getcpu + +.macro vdso_syscall func,syscall + .globl __kernel_\func + .type __kernel_\func,@function + __ALIGN +__kernel_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S new file mode 100644 index 0000000000..2ae201ebf9 --- /dev/null +++ b/arch/s390/kernel/vmlinux.lds.S @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* ld script to make s390 Linux kernel + * Written by Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include +#include +#include + +/* + * Put .bss..swapper_pg_dir as the first thing in .bss. This will + * make sure it has 16k alignment. + */ +#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \ + *(.bss..invalid_pg_dir) + +#define RO_EXCEPTION_TABLE_ALIGN 16 + +/* Handle ro_after_init data on our own. */ +#define RO_AFTER_INIT_DATA + +#define RUNTIME_DISCARD_EXIT + +#define EMITS_PT_NOTE + +#include +#include + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) +ENTRY(startup_continue) +jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ + . = 0x100000; + .text : { + _stext = .; /* Start of text section */ + _text = .; /* Text and read-only data */ + HEAD_TEXT + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + FTRACE_HOTPATCH_TRAMPOLINES_TEXT + *(.text.*_indirect_*) + *(.gnu.warning) + . = ALIGN(PAGE_SIZE); + _etext = .; /* End of text section */ + } :text = 0x0700 + + RO_DATA(PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + _sdata = .; /* Start of data section */ + + . = ALIGN(PAGE_SIZE); + __start_ro_after_init = .; + .data..ro_after_init : { + *(.data..ro_after_init) + JUMP_TABLE_DATA + } :data + . = ALIGN(PAGE_SIZE); + __end_ro_after_init = .; + + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) + BOOT_DATA_PRESERVED + + . = ALIGN(8); + .amode31.refs : { + _start_amode31_refs = .; + *(.amode31.refs) + _end_amode31_refs = .; + } + + . = ALIGN(PAGE_SIZE); + _edata = .; /* End of data section */ + + /* will be freed after init */ + . = ALIGN(PAGE_SIZE); /* Init code and data */ + __init_begin = .; + + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + . = ALIGN(PAGE_SIZE); + _einittext = .; + } + + /* + * .exit.text is discarded at runtime, not link time, + * to deal with references from __bug_table + */ + .exit.text : { + EXIT_TEXT + } + + .exit.data : { + EXIT_DATA + } + + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + * Note, that it is a part of __init region. + */ + . = ALIGN(8); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + * Note, that it is a part of __init region. + */ + .altinstr_replacement : { + *(.altinstr_replacement) + } + + /* + * Table with the patch locations to undo expolines + */ + . = ALIGN(4); + .nospec_call_table : { + __nospec_call_start = . ; + *(.s390_indirect*) + __nospec_call_end = . ; + } + .nospec_return_table : { + __nospec_return_start = . ; + *(.s390_return*) + __nospec_return_end = . ; + } + + BOOT_DATA + + /* + * .amode31 section for code, data, ex_table that need to stay + * below 2 GB, even when the kernel is relocated above 2 GB. + */ + . = ALIGN(PAGE_SIZE); + _samode31 = .; + .amode31.text : { + _stext_amode31 = .; + *(.amode31.text) + *(.amode31.text.*_indirect_*) + . = ALIGN(PAGE_SIZE); + _etext_amode31 = .; + } + . = ALIGN(16); + .amode31.ex_table : { + _start_amode31_ex_table = .; + KEEP(*(.amode31.ex_table)) + _stop_amode31_ex_table = .; + } + . = ALIGN(PAGE_SIZE); + .amode31.data : { + *(.amode31.data) + } + . = ALIGN(PAGE_SIZE); + _eamode31 = .; + + /* early.c uses stsi, which requires page aligned data. */ + . = ALIGN(PAGE_SIZE); + INIT_DATA_SECTION(0x100) + + PERCPU_SECTION(0x100) + + .dynsym ALIGN(8) : { + __dynsym_start = .; + *(.dynsym) + __dynsym_end = .; + } + .rela.dyn ALIGN(8) : { + __rela_dyn_start = .; + *(.rela*) + __rela_dyn_end = .; + } + + . = ALIGN(PAGE_SIZE); + __init_end = .; /* freed after init ends here */ + + BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + _end = . ; + + /* + * uncompressed image info used by the decompressor + * it should match struct vmlinux_info + */ + .vmlinux.info 0 (INFO) : { + QUAD(_stext) /* default_lma */ + QUAD(startup_continue) /* entry */ + QUAD(__bss_start - _stext) /* image_size */ + QUAD(__bss_stop - __bss_start) /* bss_size */ + QUAD(__boot_data_start) /* bootdata_off */ + QUAD(__boot_data_end - __boot_data_start) /* bootdata_size */ + QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ + QUAD(__boot_data_preserved_end - + __boot_data_preserved_start) /* bootdata_preserved_size */ + QUAD(__dynsym_start) /* dynsym_start */ + QUAD(__rela_dyn_start) /* rela_dyn_start */ + QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) +#ifdef CONFIG_KASAN + QUAD(kasan_early_shadow_page) + QUAD(kasan_early_shadow_pte) + QUAD(kasan_early_shadow_pmd) + QUAD(kasan_early_shadow_pud) + QUAD(kasan_early_shadow_p4d) +#endif + } :NONE + + /* Debugging sections. */ + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + *(.eh_frame) + *(.interp) + } +} diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c new file mode 100644 index 0000000000..e0a88dcaf5 --- /dev/null +++ b/arch/s390/kernel/vtime.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtual cpu timer based timer functions. + * + * Copyright IBM Corp. 2004, 2012 + * Author(s): Jan Glauber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "entry.h" + +static void virt_timer_expire(void); + +static LIST_HEAD(virt_timer_list); +static DEFINE_SPINLOCK(virt_timer_lock); +static atomic64_t virt_timer_current; +static atomic64_t virt_timer_elapsed; + +DEFINE_PER_CPU(u64, mt_cycles[8]); +static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; +static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; +static DEFINE_PER_CPU(u64, mt_scaling_jiffies); + +static inline u64 get_vtimer(void) +{ + u64 timer; + + asm volatile("stpt %0" : "=Q" (timer)); + return timer; +} + +static inline void set_vtimer(u64 expires) +{ + u64 timer; + + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ + " spt %1" /* Set new value imm. afterwards */ + : "=Q" (timer) : "Q" (expires)); + S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; + S390_lowcore.last_update_timer = expires; +} + +static inline int virt_timer_forward(u64 elapsed) +{ + BUG_ON(!irqs_disabled()); + + if (list_empty(&virt_timer_list)) + return 0; + elapsed = atomic64_add_return(elapsed, &virt_timer_elapsed); + return elapsed >= atomic64_read(&virt_timer_current); +} + +static void update_mt_scaling(void) +{ + u64 cycles_new[8], *cycles_old; + u64 delta, fac, mult, div; + int i; + + stcctm(MT_DIAG, smp_cpu_mtid + 1, cycles_new); + cycles_old = this_cpu_ptr(mt_cycles); + fac = 1; + mult = div = 0; + for (i = 0; i <= smp_cpu_mtid; i++) { + delta = cycles_new[i] - cycles_old[i]; + div += delta; + mult *= i + 1; + mult += delta * fac; + fac *= i + 1; + } + div *= fac; + if (div > 0) { + /* Update scaling factor */ + __this_cpu_write(mt_scaling_mult, mult); + __this_cpu_write(mt_scaling_div, div); + memcpy(cycles_old, cycles_new, + sizeof(u64) * (smp_cpu_mtid + 1)); + } + __this_cpu_write(mt_scaling_jiffies, jiffies_64); +} + +static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new) +{ + u64 delta; + + delta = new - *tsk_vtime; + *tsk_vtime = new; + return delta; +} + + +static inline u64 scale_vtime(u64 vtime) +{ + u64 mult = __this_cpu_read(mt_scaling_mult); + u64 div = __this_cpu_read(mt_scaling_div); + + if (smp_cpu_mtid) + return vtime * mult / div; + return vtime; +} + +static void account_system_index_scaled(struct task_struct *p, u64 cputime, + enum cpu_usage_stat index) +{ + p->stimescaled += cputime_to_nsecs(scale_vtime(cputime)); + account_system_index_time(p, cputime_to_nsecs(cputime), index); +} + +/* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ +static int do_account_vtime(struct task_struct *tsk) +{ + u64 timer, clock, user, guest, system, hardirq, softirq; + + timer = S390_lowcore.last_update_timer; + clock = S390_lowcore.last_update_clock; + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ + " stckf %1" /* Store current tod clock value */ + : "=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock) + : : "cc"); + clock = S390_lowcore.last_update_clock - clock; + timer -= S390_lowcore.last_update_timer; + + if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else + S390_lowcore.system_timer += timer; + + /* Update MT utilization calculation */ + if (smp_cpu_mtid && + time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) + update_mt_scaling(); + + /* Calculate cputime delta */ + user = update_tsk_timer(&tsk->thread.user_timer, + READ_ONCE(S390_lowcore.user_timer)); + guest = update_tsk_timer(&tsk->thread.guest_timer, + READ_ONCE(S390_lowcore.guest_timer)); + system = update_tsk_timer(&tsk->thread.system_timer, + READ_ONCE(S390_lowcore.system_timer)); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, + READ_ONCE(S390_lowcore.hardirq_timer)); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, + READ_ONCE(S390_lowcore.softirq_timer)); + S390_lowcore.steal_timer += + clock - user - guest - system - hardirq - softirq; + + /* Push account value */ + if (user) { + account_user_time(tsk, cputime_to_nsecs(user)); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(user)); + } + + if (guest) { + account_guest_time(tsk, cputime_to_nsecs(guest)); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest)); + } + + if (system) + account_system_index_scaled(tsk, system, CPUTIME_SYSTEM); + if (hardirq) + account_system_index_scaled(tsk, hardirq, CPUTIME_IRQ); + if (softirq) + account_system_index_scaled(tsk, softirq, CPUTIME_SOFTIRQ); + + return virt_timer_forward(user + guest + system + hardirq + softirq); +} + +void vtime_task_switch(struct task_struct *prev) +{ + do_account_vtime(prev); + prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.guest_timer = S390_lowcore.guest_timer; + prev->thread.system_timer = S390_lowcore.system_timer; + prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; + prev->thread.softirq_timer = S390_lowcore.softirq_timer; + S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.guest_timer = current->thread.guest_timer; + S390_lowcore.system_timer = current->thread.system_timer; + S390_lowcore.hardirq_timer = current->thread.hardirq_timer; + S390_lowcore.softirq_timer = current->thread.softirq_timer; +} + +/* + * In s390, accounting pending user time also implies + * accounting system time in order to correctly compute + * the stolen time accounting. + */ +void vtime_flush(struct task_struct *tsk) +{ + u64 steal, avg_steal; + + if (do_account_vtime(tsk)) + virt_timer_expire(); + + steal = S390_lowcore.steal_timer; + avg_steal = S390_lowcore.avg_steal_timer / 2; + if ((s64) steal > 0) { + S390_lowcore.steal_timer = 0; + account_steal_time(cputime_to_nsecs(steal)); + avg_steal += steal; + } + S390_lowcore.avg_steal_timer = avg_steal; +} + +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + +/* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ +void vtime_account_kernel(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; + else + S390_lowcore.system_timer += delta; + + virt_timer_forward(delta); +} +EXPORT_SYMBOL_GPL(vtime_account_kernel); + +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + +/* + * Sorted add to a list. List is linear searched until first bigger + * element is found. + */ +static void list_add_sorted(struct vtimer_list *timer, struct list_head *head) +{ + struct vtimer_list *tmp; + + list_for_each_entry(tmp, head, entry) { + if (tmp->expires > timer->expires) { + list_add_tail(&timer->entry, &tmp->entry); + return; + } + } + list_add_tail(&timer->entry, head); +} + +/* + * Handler for expired virtual CPU timer. + */ +static void virt_timer_expire(void) +{ + struct vtimer_list *timer, *tmp; + unsigned long elapsed; + LIST_HEAD(cb_list); + + /* walk timer list, fire all expired timers */ + spin_lock(&virt_timer_lock); + elapsed = atomic64_read(&virt_timer_elapsed); + list_for_each_entry_safe(timer, tmp, &virt_timer_list, entry) { + if (timer->expires < elapsed) + /* move expired timer to the callback queue */ + list_move_tail(&timer->entry, &cb_list); + else + timer->expires -= elapsed; + } + if (!list_empty(&virt_timer_list)) { + timer = list_first_entry(&virt_timer_list, + struct vtimer_list, entry); + atomic64_set(&virt_timer_current, timer->expires); + } + atomic64_sub(elapsed, &virt_timer_elapsed); + spin_unlock(&virt_timer_lock); + + /* Do callbacks and recharge periodic timers */ + list_for_each_entry_safe(timer, tmp, &cb_list, entry) { + list_del_init(&timer->entry); + timer->function(timer->data); + if (timer->interval) { + /* Recharge interval timer */ + timer->expires = timer->interval + + atomic64_read(&virt_timer_elapsed); + spin_lock(&virt_timer_lock); + list_add_sorted(timer, &virt_timer_list); + spin_unlock(&virt_timer_lock); + } + } +} + +void init_virt_timer(struct vtimer_list *timer) +{ + timer->function = NULL; + INIT_LIST_HEAD(&timer->entry); +} +EXPORT_SYMBOL(init_virt_timer); + +static inline int vtimer_pending(struct vtimer_list *timer) +{ + return !list_empty(&timer->entry); +} + +static void internal_add_vtimer(struct vtimer_list *timer) +{ + if (list_empty(&virt_timer_list)) { + /* First timer, just program it. */ + atomic64_set(&virt_timer_current, timer->expires); + atomic64_set(&virt_timer_elapsed, 0); + list_add(&timer->entry, &virt_timer_list); + } else { + /* Update timer against current base. */ + timer->expires += atomic64_read(&virt_timer_elapsed); + if (likely((s64) timer->expires < + (s64) atomic64_read(&virt_timer_current))) + /* The new timer expires before the current timer. */ + atomic64_set(&virt_timer_current, timer->expires); + /* Insert new timer into the list. */ + list_add_sorted(timer, &virt_timer_list); + } +} + +static void __add_vtimer(struct vtimer_list *timer, int periodic) +{ + unsigned long flags; + + timer->interval = periodic ? timer->expires : 0; + spin_lock_irqsave(&virt_timer_lock, flags); + internal_add_vtimer(timer); + spin_unlock_irqrestore(&virt_timer_lock, flags); +} + +/* + * add_virt_timer - add a oneshot virtual CPU timer + */ +void add_virt_timer(struct vtimer_list *timer) +{ + __add_vtimer(timer, 0); +} +EXPORT_SYMBOL(add_virt_timer); + +/* + * add_virt_timer_int - add an interval virtual CPU timer + */ +void add_virt_timer_periodic(struct vtimer_list *timer) +{ + __add_vtimer(timer, 1); +} +EXPORT_SYMBOL(add_virt_timer_periodic); + +static int __mod_vtimer(struct vtimer_list *timer, u64 expires, int periodic) +{ + unsigned long flags; + int rc; + + BUG_ON(!timer->function); + + if (timer->expires == expires && vtimer_pending(timer)) + return 1; + spin_lock_irqsave(&virt_timer_lock, flags); + rc = vtimer_pending(timer); + if (rc) + list_del_init(&timer->entry); + timer->interval = periodic ? expires : 0; + timer->expires = expires; + internal_add_vtimer(timer); + spin_unlock_irqrestore(&virt_timer_lock, flags); + return rc; +} + +/* + * returns whether it has modified a pending timer (1) or not (0) + */ +int mod_virt_timer(struct vtimer_list *timer, u64 expires) +{ + return __mod_vtimer(timer, expires, 0); +} +EXPORT_SYMBOL(mod_virt_timer); + +/* + * returns whether it has modified a pending timer (1) or not (0) + */ +int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires) +{ + return __mod_vtimer(timer, expires, 1); +} +EXPORT_SYMBOL(mod_virt_timer_periodic); + +/* + * Delete a virtual timer. + * + * returns whether the deleted timer was pending (1) or not (0) + */ +int del_virt_timer(struct vtimer_list *timer) +{ + unsigned long flags; + + if (!vtimer_pending(timer)) + return 0; + spin_lock_irqsave(&virt_timer_lock, flags); + list_del_init(&timer->entry); + spin_unlock_irqrestore(&virt_timer_lock, flags); + return 1; +} +EXPORT_SYMBOL(del_virt_timer); + +/* + * Start the virtual CPU timer on the current CPU. + */ +void vtime_init(void) +{ + /* set initial cpu timer */ + set_vtimer(VTIMER_MAX_SLICE); + /* Setup initial MT scaling values */ + if (smp_cpu_mtid) { + __this_cpu_write(mt_scaling_jiffies, jiffies); + __this_cpu_write(mt_scaling_mult, 1); + __this_cpu_write(mt_scaling_div, 1); + stcctm(MT_DIAG, smp_cpu_mtid + 1, this_cpu_ptr(mt_cycles)); + } +} diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig new file mode 100644 index 0000000000..45fdf2a9b2 --- /dev/null +++ b/arch/s390/kvm/Kconfig @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# KVM configuration +# +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + def_bool y + prompt "KVM" + help + Say Y here to get to see options for using your Linux host to run other + operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled. + +if VIRTUALIZATION + +config KVM + def_tristate y + prompt "Kernel-based Virtual Machine (KVM) support" + depends on HAVE_KVM + select PREEMPT_NOTIFIERS + select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_VCPU_ASYNC_IOCTL + select HAVE_KVM_EVENTFD + select KVM_ASYNC_PF + select KVM_ASYNC_PF_SYNC + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_INVALID_WAKEUPS + select HAVE_KVM_NO_POLL + select KVM_VFIO + select INTERVAL_TREE + select MMU_NOTIFIER + help + Support hosting paravirtualized guest machines using the SIE + virtualization capability on the mainframe. This should work + on any 64bit machine. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + To compile this as a module, choose M here: the module + will be called kvm. + + If unsure, say N. + +config KVM_S390_UCONTROL + bool "Userspace controlled virtual machines" + depends on KVM + help + Allow CAP_SYS_ADMIN users to create KVM virtual machines that are + controlled by userspace. + + If unsure, say N. + +endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile new file mode 100644 index 0000000000..02217fb4ae --- /dev/null +++ b/arch/s390/kvm/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for kernel virtual machines on s390 +# +# Copyright IBM Corp. 2008 + +include $(srctree)/virt/kvm/Makefile.kvm + +ccflags-y := -Ivirt/kvm -Iarch/s390/kvm + +kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o + +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o +obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c new file mode 100644 index 0000000000..3c65b8258a --- /dev/null +++ b/arch/s390/kvm/diag.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * handling diagnose instructions + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + * Christian Borntraeger + */ + +#include +#include +#include +#include +#include "kvm-s390.h" +#include "trace.h" +#include "trace-s390.h" +#include "gaccess.h" + +static int diag_release_pages(struct kvm_vcpu *vcpu) +{ + unsigned long start, end; + unsigned long prefix = kvm_s390_get_prefix(vcpu); + + start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; + vcpu->stat.instruction_diagnose_10++; + + if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end + || start < 2 * PAGE_SIZE) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); + + /* + * We checked for start >= end above, so lets check for the + * fast path (no prefix swap page involved) + */ + if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { + gmap_discard(vcpu->arch.gmap, start, end); + } else { + /* + * This is slow path. gmap_discard will check for start + * so lets split this into before prefix, prefix, after + * prefix and let gmap_discard make some of these calls + * NOPs. + */ + gmap_discard(vcpu->arch.gmap, start, prefix); + if (start <= prefix) + gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + if (end > prefix + PAGE_SIZE) + gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); + gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); + } + return 0; +} + +static int __diag_page_ref_service(struct kvm_vcpu *vcpu) +{ + struct prs_parm { + u16 code; + u16 subcode; + u16 parm_len; + u16 parm_version; + u64 token_addr; + u64 select_mask; + u64 compare_mask; + u64 zarch; + }; + struct prs_parm parm; + int rc; + u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; + u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); + + VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", + vcpu->run->s.regs.gprs[rx]); + vcpu->stat.instruction_diagnose_258++; + if (vcpu->run->s.regs.gprs[rx] & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + switch (parm.subcode) { + case 0: /* TOKEN */ + VCPU_EVENT(vcpu, 3, "pageref token addr 0x%llx " + "select mask 0x%llx compare mask 0x%llx", + parm.token_addr, parm.select_mask, parm.compare_mask); + if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) { + /* + * If the pagefault handshake is already activated, + * the token must not be changed. We have to return + * decimal 8 instead, as mandated in SC24-6084. + */ + vcpu->run->s.regs.gprs[ry] = 8; + return 0; + } + + if ((parm.compare_mask & parm.select_mask) != parm.compare_mask || + parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + vcpu->arch.pfault_token = parm.token_addr; + vcpu->arch.pfault_select = parm.select_mask; + vcpu->arch.pfault_compare = parm.compare_mask; + vcpu->run->s.regs.gprs[ry] = 0; + rc = 0; + break; + case 1: /* + * CANCEL + * Specification allows to let already pending tokens survive + * the cancel, therefore to reduce code complexity, we assume + * all outstanding tokens are already pending. + */ + VCPU_EVENT(vcpu, 3, "pageref cancel addr 0x%llx", parm.token_addr); + if (parm.token_addr || parm.select_mask || + parm.compare_mask || parm.zarch) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + vcpu->run->s.regs.gprs[ry] = 0; + /* + * If the pfault handling was not established or is already + * canceled SC24-6084 requests to return decimal 4. + */ + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + vcpu->run->s.regs.gprs[ry] = 4; + else + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + + return rc; +} + +static int __diag_time_slice_end(struct kvm_vcpu *vcpu) +{ + VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); + vcpu->stat.instruction_diagnose_44++; + kvm_vcpu_on_spin(vcpu, true); + return 0; +} + +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + +static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tcpu; + int tcpu_cpu; + int tid; + + tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; + vcpu->stat.instruction_diagnose_9c++; + + /* yield to self */ + if (tid == vcpu->vcpu_id) + goto no_yield; + + /* yield to invalid */ + tcpu = kvm_get_vcpu_by_id(vcpu->kvm, tid); + if (!tcpu) + goto no_yield; + + /* target guest VCPU already running */ + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu_cpu)) + goto no_yield; + smp_yield_cpu(tcpu_cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diag_9c_forward++; + return 0; + } + + if (kvm_vcpu_yield_to(tcpu) <= 0) + goto no_yield; + + VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: done", tid); + return 0; +no_yield: + VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid); + vcpu->stat.diag_9c_ignored++; + return 0; +} + +static int __diag_ipl_functions(struct kvm_vcpu *vcpu) +{ + unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; + unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; + + VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); + vcpu->stat.instruction_diagnose_308++; + switch (subcode) { + case 3: + vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; + break; + case 4: + vcpu->run->s390_reset_flags = 0; + break; + default: + return -EOPNOTSUPP; + } + + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; + vcpu->run->s390_reset_flags |= KVM_S390_RESET_IPL; + vcpu->run->s390_reset_flags |= KVM_S390_RESET_CPU_INIT; + vcpu->run->exit_reason = KVM_EXIT_S390_RESET; + VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx", + vcpu->run->s390_reset_flags); + trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags); + return -EREMOTE; +} + +static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) +{ + int ret; + + vcpu->stat.instruction_diagnose_500++; + /* No virtio-ccw notification? Get out quickly. */ + if (!vcpu->kvm->arch.css_support || + (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) + return -EOPNOTSUPP; + + VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx", + (u32) vcpu->run->s.regs.gprs[2], + (u32) vcpu->run->s.regs.gprs[3], + vcpu->run->s.regs.gprs[4]); + + /* + * The layout is as follows: + * - gpr 2 contains the subchannel id (passed as addr) + * - gpr 3 contains the virtqueue index (passed as datamatch) + * - gpr 4 contains the index on the bus (optionally) + */ + ret = kvm_io_bus_write_cookie(vcpu, KVM_VIRTIO_CCW_NOTIFY_BUS, + vcpu->run->s.regs.gprs[2] & 0xffffffff, + 8, &vcpu->run->s.regs.gprs[3], + vcpu->run->s.regs.gprs[4]); + + /* + * Return cookie in gpr 2, but don't overwrite the register if the + * diagnose will be handled by userspace. + */ + if (ret != -EOPNOTSUPP) + vcpu->run->s.regs.gprs[2] = ret; + /* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */ + return ret < 0 ? ret : 0; +} + +int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) +{ + int code = kvm_s390_get_base_disp_rs(vcpu, NULL) & 0xffff; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + trace_kvm_s390_handle_diag(vcpu, code); + switch (code) { + case 0x10: + return diag_release_pages(vcpu); + case 0x44: + return __diag_time_slice_end(vcpu); + case 0x9c: + return __diag_time_slice_end_directed(vcpu); + case 0x258: + return __diag_page_ref_service(vcpu); + case 0x308: + return __diag_ipl_functions(vcpu); + case 0x500: + return __diag_virtio_hypercall(vcpu); + default: + vcpu->stat.instruction_diagnose_other++; + return -EOPNOTSUPP; + } +} diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c new file mode 100644 index 0000000000..6d6bc19b37 --- /dev/null +++ b/arch/s390/kvm/gaccess.c @@ -0,0 +1,1624 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * guest access functions + * + * Copyright IBM Corp. 2014 + * + */ + +#include +#include +#include +#include +#include + +#include +#include "kvm-s390.h" +#include "gaccess.h" +#include + +union asce { + unsigned long val; + struct { + unsigned long origin : 52; /* Region- or Segment-Table Origin */ + unsigned long : 2; + unsigned long g : 1; /* Subspace Group Control */ + unsigned long p : 1; /* Private Space Control */ + unsigned long s : 1; /* Storage-Alteration-Event Control */ + unsigned long x : 1; /* Space-Switch-Event Control */ + unsigned long r : 1; /* Real-Space Control */ + unsigned long : 1; + unsigned long dt : 2; /* Designation-Type Control */ + unsigned long tl : 2; /* Region- or Segment-Table Length */ + }; +}; + +enum { + ASCE_TYPE_SEGMENT = 0, + ASCE_TYPE_REGION3 = 1, + ASCE_TYPE_REGION2 = 2, + ASCE_TYPE_REGION1 = 3 +}; + +union region1_table_entry { + unsigned long val; + struct { + unsigned long rto: 52;/* Region-Table Origin */ + unsigned long : 2; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Region-Second-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long : 1; + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Region-Second-Table Length */ + }; +}; + +union region2_table_entry { + unsigned long val; + struct { + unsigned long rto: 52;/* Region-Table Origin */ + unsigned long : 2; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Region-Third-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long : 1; + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Region-Third-Table Length */ + }; +}; + +struct region3_table_entry_fc0 { + unsigned long sto: 52;/* Segment-Table Origin */ + unsigned long : 1; + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 1; + unsigned long tf : 2; /* Segment-Table Offset */ + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr : 1; /* Common-Region Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long tl : 2; /* Segment-Table Length */ +}; + +struct region3_table_entry_fc1 { + unsigned long rfaa : 33; /* Region-Frame Absolute Address */ + unsigned long : 14; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr : 1; /* Common-Region Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +union region3_table_entry { + unsigned long val; + struct region3_table_entry_fc0 fc0; + struct region3_table_entry_fc1 fc1; + struct { + unsigned long : 53; + unsigned long fc : 1; /* Format-Control */ + unsigned long : 4; + unsigned long i : 1; /* Region-Invalid Bit */ + unsigned long cr : 1; /* Common-Region Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + }; +}; + +struct segment_entry_fc0 { + unsigned long pto: 53;/* Page-Table Origin */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +struct segment_entry_fc1 { + unsigned long sfaa : 44; /* Segment-Frame Absolute Address */ + unsigned long : 3; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; +}; + +union segment_table_entry { + unsigned long val; + struct segment_entry_fc0 fc0; + struct segment_entry_fc1 fc1; + struct { + unsigned long : 53; + unsigned long fc : 1; /* Format-Control */ + unsigned long : 4; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + }; +}; + +enum { + TABLE_TYPE_SEGMENT = 0, + TABLE_TYPE_REGION3 = 1, + TABLE_TYPE_REGION2 = 2, + TABLE_TYPE_REGION1 = 3 +}; + +union page_table_entry { + unsigned long val; + struct { + unsigned long pfra : 52; /* Page-Frame Real Address */ + unsigned long z : 1; /* Zero Bit */ + unsigned long i : 1; /* Page-Invalid Bit */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 8; + }; +}; + +/* + * vaddress union in order to easily decode a virtual address into its + * region first index, region second index etc. parts. + */ +union vaddress { + unsigned long addr; + struct { + unsigned long rfx : 11; + unsigned long rsx : 11; + unsigned long rtx : 11; + unsigned long sx : 11; + unsigned long px : 8; + unsigned long bx : 12; + }; + struct { + unsigned long rfx01 : 2; + unsigned long : 9; + unsigned long rsx01 : 2; + unsigned long : 9; + unsigned long rtx01 : 2; + unsigned long : 9; + unsigned long sx01 : 2; + unsigned long : 29; + }; +}; + +/* + * raddress union which will contain the result (real or absolute address) + * after a page table walk. The rfaa, sfaa and pfra members are used to + * simply assign them the value of a region, segment or page table entry. + */ +union raddress { + unsigned long addr; + unsigned long rfaa : 33; /* Region-Frame Absolute Address */ + unsigned long sfaa : 44; /* Segment-Frame Absolute Address */ + unsigned long pfra : 52; /* Page-Frame Real Address */ +}; + +union alet { + u32 val; + struct { + u32 reserved : 7; + u32 p : 1; + u32 alesn : 8; + u32 alen : 16; + }; +}; + +union ald { + u32 val; + struct { + u32 : 1; + u32 alo : 24; + u32 all : 7; + }; +}; + +struct ale { + unsigned long i : 1; /* ALEN-Invalid Bit */ + unsigned long : 5; + unsigned long fo : 1; /* Fetch-Only Bit */ + unsigned long p : 1; /* Private Bit */ + unsigned long alesn : 8; /* Access-List-Entry Sequence Number */ + unsigned long aleax : 16; /* Access-List-Entry Authorization Index */ + unsigned long : 32; + unsigned long : 1; + unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */ + unsigned long : 6; + unsigned long astesn : 32; /* ASTE Sequence Number */ +}; + +struct aste { + unsigned long i : 1; /* ASX-Invalid Bit */ + unsigned long ato : 29; /* Authority-Table Origin */ + unsigned long : 1; + unsigned long b : 1; /* Base-Space Bit */ + unsigned long ax : 16; /* Authorization Index */ + unsigned long atl : 12; /* Authority-Table Length */ + unsigned long : 2; + unsigned long ca : 1; /* Controlled-ASN Bit */ + unsigned long ra : 1; /* Reusable-ASN Bit */ + unsigned long asce : 64; /* Address-Space-Control Element */ + unsigned long ald : 32; + unsigned long astesn : 32; + /* .. more fields there */ +}; + +int ipte_lock_held(struct kvm *kvm) +{ + if (sclp.has_siif) { + int rc; + + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); + return rc; + } + return kvm->arch.ipte_lock_count != 0; +} + +static void ipte_lock_simple(struct kvm *kvm) +{ + union ipte_control old, new, *ic; + + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) + goto out; +retry: + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + do { + old = READ_ONCE(*ic); + if (old.k) { + read_unlock(&kvm->arch.sca_lock); + cond_resched(); + goto retry; + } + new = old; + new.k = 1; + } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +out: + mutex_unlock(&kvm->arch.ipte_mutex); +} + +static void ipte_unlock_simple(struct kvm *kvm) +{ + union ipte_control old, new, *ic; + + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) + goto out; + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + do { + old = READ_ONCE(*ic); + new = old; + new.k = 0; + } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); +out: + mutex_unlock(&kvm->arch.ipte_mutex); +} + +static void ipte_lock_siif(struct kvm *kvm) +{ + union ipte_control old, new, *ic; + +retry: + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + do { + old = READ_ONCE(*ic); + if (old.kg) { + read_unlock(&kvm->arch.sca_lock); + cond_resched(); + goto retry; + } + new = old; + new.k = 1; + new.kh++; + } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static void ipte_unlock_siif(struct kvm *kvm) +{ + union ipte_control old, new, *ic; + + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); + do { + old = READ_ONCE(*ic); + new = old; + new.kh--; + if (!new.kh) + new.k = 0; + } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); + if (!new.kh) + wake_up(&kvm->arch.ipte_wq); +} + +void ipte_lock(struct kvm *kvm) +{ + if (sclp.has_siif) + ipte_lock_siif(kvm); + else + ipte_lock_simple(kvm); +} + +void ipte_unlock(struct kvm *kvm) +{ + if (sclp.has_siif) + ipte_unlock_siif(kvm); + else + ipte_unlock_simple(kvm); +} + +static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, + enum gacc_mode mode) +{ + union alet alet; + struct ale ale; + struct aste aste; + unsigned long ald_addr, authority_table_addr; + union ald ald; + int eax, rc; + u8 authority_table; + + if (ar >= NUM_ACRS) + return -EINVAL; + + save_access_regs(vcpu->run->s.regs.acrs); + alet.val = vcpu->run->s.regs.acrs[ar]; + + if (ar == 0 || alet.val == 0) { + asce->val = vcpu->arch.sie_block->gcr[1]; + return 0; + } else if (alet.val == 1) { + asce->val = vcpu->arch.sie_block->gcr[7]; + return 0; + } + + if (alet.reserved) + return PGM_ALET_SPECIFICATION; + + if (alet.p) + ald_addr = vcpu->arch.sie_block->gcr[5]; + else + ald_addr = vcpu->arch.sie_block->gcr[2]; + ald_addr &= 0x7fffffc0; + + rc = read_guest_real(vcpu, ald_addr + 16, &ald.val, sizeof(union ald)); + if (rc) + return rc; + + if (alet.alen / 8 > ald.all) + return PGM_ALEN_TRANSLATION; + + if (0x7fffffff - ald.alo * 128 < alet.alen * 16) + return PGM_ADDRESSING; + + rc = read_guest_real(vcpu, ald.alo * 128 + alet.alen * 16, &ale, + sizeof(struct ale)); + if (rc) + return rc; + + if (ale.i == 1) + return PGM_ALEN_TRANSLATION; + if (ale.alesn != alet.alesn) + return PGM_ALE_SEQUENCE; + + rc = read_guest_real(vcpu, ale.asteo * 64, &aste, sizeof(struct aste)); + if (rc) + return rc; + + if (aste.i) + return PGM_ASTE_VALIDITY; + if (aste.astesn != ale.astesn) + return PGM_ASTE_SEQUENCE; + + if (ale.p == 1) { + eax = (vcpu->arch.sie_block->gcr[8] >> 16) & 0xffff; + if (ale.aleax != eax) { + if (eax / 16 > aste.atl) + return PGM_EXTENDED_AUTHORITY; + + authority_table_addr = aste.ato * 4 + eax / 4; + + rc = read_guest_real(vcpu, authority_table_addr, + &authority_table, + sizeof(u8)); + if (rc) + return rc; + + if ((authority_table & (0x40 >> ((eax & 3) * 2))) == 0) + return PGM_EXTENDED_AUTHORITY; + } + } + + if (ale.fo == 1 && mode == GACC_STORE) + return PGM_PROTECTION; + + asce->val = aste.asce; + return 0; +} + +struct trans_exc_code_bits { + unsigned long addr : 52; /* Translation-exception Address */ + unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; + unsigned long b60 : 1; + unsigned long b61 : 1; + unsigned long as : 2; /* ASCE Identifier */ +}; + +enum { + FSI_UNKNOWN = 0, /* Unknown whether fetch or store */ + FSI_STORE = 1, /* Exception was due to store operation */ + FSI_FETCH = 2 /* Exception was due to fetch operation */ +}; + +enum prot_type { + PROT_TYPE_LA = 0, + PROT_TYPE_KEYC = 1, + PROT_TYPE_ALC = 2, + PROT_TYPE_DAT = 3, + PROT_TYPE_IEP = 4, + /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ + PROT_NONE, +}; + +static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot, bool terminate) +{ + struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; + struct trans_exc_code_bits *tec; + + memset(pgm, 0, sizeof(*pgm)); + pgm->code = code; + tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + + switch (code) { + case PGM_PROTECTION: + switch (prot) { + case PROT_NONE: + /* We should never get here, acts like termination */ + WARN_ON_ONCE(1); + break; + case PROT_TYPE_IEP: + tec->b61 = 1; + fallthrough; + case PROT_TYPE_LA: + tec->b56 = 1; + break; + case PROT_TYPE_KEYC: + tec->b60 = 1; + break; + case PROT_TYPE_ALC: + tec->b60 = 1; + fallthrough; + case PROT_TYPE_DAT: + tec->b61 = 1; + break; + } + if (terminate) { + tec->b56 = 0; + tec->b60 = 0; + tec->b61 = 0; + } + fallthrough; + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + /* + * op_access_id only applies to MOVE_PAGE -> set bit 61 + * exc_access_id has to be set to 0 for some instructions. Both + * cases have to be handled by the caller. + */ + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + fallthrough; + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_ASTE_SEQUENCE: + case PGM_EXTENDED_AUTHORITY: + /* + * We can always store exc_access_id, as it is + * undefined for non-ar cases. It is undefined for + * most DAT protection exceptions. + */ + pgm->exc_access_id = ar; + break; + } + return code; +} + +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot) +{ + return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false); +} + +static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, + unsigned long ga, u8 ar, enum gacc_mode mode) +{ + int rc; + struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); + + if (!psw.dat) { + asce->val = 0; + asce->r = 1; + return 0; + } + + if ((mode == GACC_IFETCH) && (psw.as != PSW_BITS_AS_HOME)) + psw.as = PSW_BITS_AS_PRIMARY; + + switch (psw.as) { + case PSW_BITS_AS_PRIMARY: + asce->val = vcpu->arch.sie_block->gcr[1]; + return 0; + case PSW_BITS_AS_SECONDARY: + asce->val = vcpu->arch.sie_block->gcr[7]; + return 0; + case PSW_BITS_AS_HOME: + asce->val = vcpu->arch.sie_block->gcr[13]; + return 0; + case PSW_BITS_AS_ACCREG: + rc = ar_translation(vcpu, asce, ar, mode); + if (rc > 0) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); + return rc; + } + return 0; +} + +static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) +{ + return kvm_read_guest(kvm, gpa, val, sizeof(*val)); +} + +/** + * guest_translate - translate a guest virtual into a guest absolute address + * @vcpu: virtual cpu + * @gva: guest virtual address + * @gpa: points to where guest physical (absolute) address should be stored + * @asce: effective asce + * @mode: indicates the access mode to be used + * @prot: returns the type for protection exceptions + * + * Translate a guest virtual address into a guest absolute address by means + * of dynamic address translation as specified by the architecture. + * If the resulting absolute address is not available in the configuration + * an addressing exception is indicated and @gpa will not be changed. + * + * Returns: - zero on success; @gpa contains the resulting absolute address + * - a negative value if guest access failed due to e.g. broken + * guest mapping + * - a positive value if an access exception happened. In this case + * the returned value is the program interruption code as defined + * by the architecture + */ +static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa, const union asce asce, + enum gacc_mode mode, enum prot_type *prot) +{ + union vaddress vaddr = {.addr = gva}; + union raddress raddr = {.addr = gva}; + union page_table_entry pte; + int dat_protection = 0; + int iep_protection = 0; + union ctlreg0 ctlreg0; + unsigned long ptr; + int edat1, edat2, iep; + + ctlreg0.val = vcpu->arch.sie_block->gcr[0]; + edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); + edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); + iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); + if (asce.r) + goto real_address; + ptr = asce.origin * PAGE_SIZE; + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl) + return PGM_REGION_FIRST_TRANS; + ptr += vaddr.rfx * 8; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + ptr += vaddr.rsx * 8; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + ptr += vaddr.rtx * 8; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + ptr += vaddr.sx * 8; + break; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (kvm_is_error_gpa(vcpu->kvm, ptr)) + return PGM_ADDRESSING; + if (deref_table(vcpu->kvm, ptr, &rfte.val)) + return -EFAULT; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (edat1) + dat_protection |= rfte.p; + ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; + } + fallthrough; + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (kvm_is_error_gpa(vcpu->kvm, ptr)) + return PGM_ADDRESSING; + if (deref_table(vcpu->kvm, ptr, &rste.val)) + return -EFAULT; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (edat1) + dat_protection |= rste.p; + ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; + } + fallthrough; + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (kvm_is_error_gpa(vcpu->kvm, ptr)) + return PGM_ADDRESSING; + if (deref_table(vcpu->kvm, ptr, &rtte.val)) + return -EFAULT; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && edat2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && edat2) { + dat_protection |= rtte.fc1.p; + iep_protection = rtte.fc1.iep; + raddr.rfaa = rtte.fc1.rfaa; + goto absolute_address; + } + if (vaddr.sx01 < rtte.fc0.tf) + return PGM_SEGMENT_TRANSLATION; + if (vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (edat1) + dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; + } + fallthrough; + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (kvm_is_error_gpa(vcpu->kvm, ptr)) + return PGM_ADDRESSING; + if (deref_table(vcpu->kvm, ptr, &ste.val)) + return -EFAULT; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + if (ste.fc && edat1) { + dat_protection |= ste.fc1.p; + iep_protection = ste.fc1.iep; + raddr.sfaa = ste.fc1.sfaa; + goto absolute_address; + } + dat_protection |= ste.fc0.p; + ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; + } + } + if (kvm_is_error_gpa(vcpu->kvm, ptr)) + return PGM_ADDRESSING; + if (deref_table(vcpu->kvm, ptr, &pte.val)) + return -EFAULT; + if (pte.i) + return PGM_PAGE_TRANSLATION; + if (pte.z) + return PGM_TRANSLATION_SPEC; + dat_protection |= pte.p; + iep_protection = pte.iep; + raddr.pfra = pte.pfra; +real_address: + raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr); +absolute_address: + if (mode == GACC_STORE && dat_protection) { + *prot = PROT_TYPE_DAT; + return PGM_PROTECTION; + } + if (mode == GACC_IFETCH && iep_protection && iep) { + *prot = PROT_TYPE_IEP; + return PGM_PROTECTION; + } + if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) + return PGM_ADDRESSING; + *gpa = raddr.addr; + return 0; +} + +static inline int is_low_address(unsigned long ga) +{ + /* Check for address ranges 0..511 and 4096..4607 */ + return (ga & ~0x11fful) == 0; +} + +static int low_address_protection_enabled(struct kvm_vcpu *vcpu, + const union asce asce) +{ + union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]}; + psw_t *psw = &vcpu->arch.sie_block->gpsw; + + if (!ctlreg0.lap) + return 0; + if (psw_bits(*psw).dat && asce.p) + return 0; + return 1; +} + +static int vm_check_access_key(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) +{ + u8 storage_key, access_control; + bool fetch_protected; + unsigned long hva; + int r; + + if (access_key == 0) + return 0; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + if (access_control == access_key) + return 0; + fetch_protected = storage_key & _PAGE_FP_BIT; + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + return 0; + return PGM_PROTECTION; +} + +static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode, + union asce asce) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long override; + + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* check if fetch protection override enabled */ + override = vcpu->arch.sie_block->gcr[0]; + override &= CR0_FETCH_PROTECTION_OVERRIDE; + /* not applicable if subject to DAT && private space */ + override = override && !(psw_bits(*psw).dat && asce.p); + return override; + } + return false; +} + +static bool fetch_prot_override_applies(unsigned long ga, unsigned int len) +{ + return ga < 2048 && ga + len <= 2048; +} + +static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu) +{ + /* check if storage protection override enabled */ + return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE; +} + +static bool storage_prot_override_applies(u8 access_control) +{ + /* matches special storage protection override key (9) -> allow */ + return access_control == PAGE_SPO_ACC; +} + +static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) +{ + u8 storage_key, access_control; + unsigned long hva; + int r; + + /* access key 0 matches any storage key -> allow */ + if (access_key == 0) + return 0; + /* + * caller needs to ensure that gfn is accessible, so we can + * assume that this cannot fail + */ + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + /* access key matches storage key -> allow */ + if (access_control == access_key) + return 0; + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* it is a fetch and fetch protection is off -> allow */ + if (!(storage_key & _PAGE_FP_BIT)) + return 0; + if (fetch_prot_override_applicable(vcpu, mode, asce) && + fetch_prot_override_applies(ga, len)) + return 0; + } + if (storage_prot_override_applicable(vcpu) && + storage_prot_override_applies(access_control)) + return 0; + return PGM_PROTECTION; +} + +/** + * guest_range_to_gpas() - Calculate guest physical addresses of page fragments + * covering a logical range + * @vcpu: virtual cpu + * @ga: guest address, start of range + * @ar: access register + * @gpas: output argument, may be NULL + * @len: length of range in bytes + * @asce: address-space-control element to use for translation + * @mode: access mode + * @access_key: access key to mach the range's storage keys against + * + * Translate a logical range to a series of guest absolute addresses, + * such that the concatenation of page fragments starting at each gpa make up + * the whole range. + * The translation is performed as if done by the cpu for the given @asce, @ar, + * @mode and state of the @vcpu. + * If the translation causes an exception, its program interruption code is + * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified + * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject + * a correct exception into the guest. + * The resulting gpas are stored into @gpas, unless it is NULL. + * + * Note: All fragments except the first one start at the beginning of a page. + * When deriving the boundaries of a fragment from a gpa, all but the last + * fragment end at the end of the page. + * + * Return: + * * 0 - success + * * <0 - translation could not be performed, for example if guest + * memory could not be accessed + * * >0 - an access exception occurred. In this case the returned value + * is the program interruption code and the contents of pgm may + * be used to inject an exception into the guest. + */ +static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + unsigned long *gpas, unsigned long len, + const union asce asce, enum gacc_mode mode, + u8 access_key) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned int offset = offset_in_page(ga); + unsigned int fragment_len; + int lap_enabled, rc = 0; + enum prot_type prot; + unsigned long gpa; + + lap_enabled = low_address_protection_enabled(vcpu, asce); + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + ga = kvm_s390_logical_to_effective(vcpu, ga); + if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) + return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, + PROT_TYPE_LA); + if (psw_bits(*psw).dat) { + rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); + if (rc < 0) + return rc; + } else { + gpa = kvm_s390_real_to_abs(vcpu, ga); + if (kvm_is_error_gpa(vcpu->kvm, gpa)) { + rc = PGM_ADDRESSING; + prot = PROT_NONE; + } + } + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, prot); + rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, + fragment_len); + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); + if (gpas) + *gpas++ = gpa; + offset = 0; + ga += fragment_len; + len -= fragment_len; + } + return 0; +} + +static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) +{ + const unsigned int offset = offset_in_page(gpa); + const gfn_t gfn = gpa_to_gfn(gpa); + int rc; + + if (mode == GACC_STORE) + rc = kvm_write_guest_page(kvm, gfn, data, offset, len); + else + rc = kvm_read_guest_page(kvm, gfn, data, offset, len); + return rc; +} + +static int +access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) +{ + struct kvm_memory_slot *slot; + bool writable; + gfn_t gfn; + hva_t hva; + int rc; + + gfn = gpa >> PAGE_SHIFT; + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a ro memslot, even tho that can't occur (they're unsupported). + * Don't try to actually handle that case. + */ + if (!writable && mode == GACC_STORE) + return -EOPNOTSUPP; + hva += offset_in_page(gpa); + if (mode == GACC_STORE) + rc = copy_to_user_key((void __user *)hva, data, len, access_key); + else + rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + if (rc) + return PGM_PROTECTION; + if (mode == GACC_STORE) + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key) +{ + int offset = offset_in_page(gpa); + int fragment_len; + int rc; + + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + if (rc) + return rc; + offset = 0; + len -= fragment_len; + data += fragment_len; + gpa += fragment_len; + } + return 0; +} + +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long nr_pages, idx; + unsigned long gpa_array[2]; + unsigned int fragment_len; + unsigned long *gpas; + enum prot_type prot; + int need_ipte_lock; + union asce asce; + bool try_storage_prot_override; + bool try_fetch_prot_override; + int rc; + + if (!len) + return 0; + ga = kvm_s390_logical_to_effective(vcpu, ga); + rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode); + if (rc) + return rc; + nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; + gpas = gpa_array; + if (nr_pages > ARRAY_SIZE(gpa_array)) + gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long))); + if (!gpas) + return -ENOMEM; + try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce); + try_storage_prot_override = storage_prot_override_applicable(vcpu); + need_ipte_lock = psw_bits(*psw).dat && !asce.r; + if (need_ipte_lock) + ipte_lock(vcpu->kvm); + /* + * Since we do the access further down ultimately via a move instruction + * that does key checking and returns an error in case of a protection + * violation, we don't need to do the check during address translation. + * Skip it by passing access key 0, which matches any storage key, + * obviating the need for any further checks. As a result the check is + * handled entirely in hardware on access, we only need to take care to + * forego key protection checking if fetch protection override applies or + * retry with the special key 9 in case of storage protection override. + */ + rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0); + if (rc) + goto out_unlock; + for (idx = 0; idx < nr_pages; idx++) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); + if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { + rc = access_guest_page(vcpu->kvm, mode, gpas[idx], + data, fragment_len); + } else { + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); + } + if (rc == PGM_PROTECTION && try_storage_prot_override) + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); + if (rc) + break; + len -= fragment_len; + data += fragment_len; + ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); + } + if (rc > 0) { + bool terminate = (mode == GACC_STORE) && (idx > 0); + + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; + else + prot = PROT_NONE; + rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); + } +out_unlock: + if (need_ipte_lock) + ipte_unlock(vcpu->kvm); + if (nr_pages > ARRAY_SIZE(gpa_array)) + vfree(gpas); + return rc; +} + +int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, + void *data, unsigned long len, enum gacc_mode mode) +{ + unsigned int fragment_len; + unsigned long gpa; + int rc = 0; + + while (len && !rc) { + gpa = kvm_s390_real_to_abs(vcpu, gra); + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); + rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + len -= fragment_len; + gra += fragment_len; + data += fragment_len; + } + return rc; +} + +/** + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. + * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. + */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** + * guest_translate_address_with_key - translate guest logical into guest absolute address + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @gpa: Guest physical address + * @mode: Translation access mode + * @access_key: access key to mach the storage key with + * + * Parameter semantics are the same as the ones from guest_translate. + * The memory contents at the guest address are not changed. + * + * Note: The IPTE lock is not taken during this function, so the caller + * has to take care of this. + */ +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key) +{ + union asce asce; + int rc; + + gva = kvm_s390_logical_to_effective(vcpu, gva); + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); + if (rc) + return rc; + return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode, + access_key); +} + +/** + * check_gva_range - test a range of guest virtual addresses for accessibility + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @length: Length of test range + * @mode: Translation access mode + * @access_key: access key to mach the storage keys with + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long length, enum gacc_mode mode, u8 access_key) +{ + union asce asce; + int rc = 0; + + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); + if (rc) + return rc; + ipte_lock(vcpu->kvm); + rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, + access_key); + ipte_unlock(vcpu->kvm); + + return rc; +} + +/** + * check_gpa_range - test a range of guest physical addresses for accessibility + * @kvm: virtual machine instance + * @gpa: guest physical address + * @length: length of test range + * @mode: access mode to test, relevant for storage keys + * @access_key: access key to mach the storage keys with + */ +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key) +{ + unsigned int fragment_len; + int rc = 0; + + while (length && !rc) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); + rc = vm_check_access_key(kvm, access_key, mode, gpa); + length -= fragment_len; + gpa += fragment_len; + } + return rc; +} + +/** + * kvm_s390_check_low_addr_prot_real - check for low-address protection + * @vcpu: virtual cpu + * @gra: Guest real address + * + * Checks whether an address is subject to low-address protection and set + * up vcpu->arch.pgm accordingly if necessary. + * + * Return: 0 if no protection exception, or PGM_PROTECTION if protected. + */ +int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) +{ + union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]}; + + if (!ctlreg0.lap || !is_low_address(gra)) + return 0; + return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); +} + +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the beginning of the page table for the given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) + * @dat_protection: referenced memory is write protected + * @fake: pgt references contiguous guest memory block, not a pgtable + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + *fake = 0; + *dat_protection = 0; + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * PAGE_SIZE; + if (asce.r) { + *fake = 1; + ptr = 0; + asce.dt = ASCE_TYPE_REGION1; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl && !*fake) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } + + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (*fake) { + ptr += vaddr.rfx * _REGION1_SIZE; + rfte.val = ptr; + goto shadow_r2t; + } + *pgt = ptr + vaddr.rfx * 8; + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; + ptr = rfte.rto * PAGE_SIZE; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + if (rc) + return rc; + } + fallthrough; + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (*fake) { + ptr += vaddr.rsx * _REGION2_SIZE; + rste.val = ptr; + goto shadow_r3t; + } + *pgt = ptr + vaddr.rsx * 8; + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + ptr = rste.rto * PAGE_SIZE; +shadow_r3t: + rste.p |= *dat_protection; + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + if (rc) + return rc; + } + fallthrough; + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (*fake) { + ptr += vaddr.rtx * _REGION3_SIZE; + rtte.val = ptr; + goto shadow_sgt; + } + *pgt = ptr + vaddr.rtx * 8; + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + *dat_protection |= rtte.fc0.p; + *fake = 1; + ptr = rtte.fc1.rfaa * _REGION3_SIZE; + rtte.val = ptr; + goto shadow_sgt; + } + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto * PAGE_SIZE; +shadow_sgt: + rtte.fc0.p |= *dat_protection; + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + if (rc) + return rc; + } + fallthrough; + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (*fake) { + ptr += vaddr.sx * _SEGMENT_SIZE; + ste.val = ptr; + goto shadow_pgt; + } + *pgt = ptr + vaddr.sx * 8; + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection |= ste.fc0.p; + if (ste.fc && sg->edat_level >= 1) { + *fake = 1; + ptr = ste.fc1.sfaa * _SEGMENT_SIZE; + ste.val = ptr; + goto shadow_pgt; + } + ptr = ste.fc0.pto * (PAGE_SIZE / 2); +shadow_pgt: + ste.fc0.p |= *dat_protection; + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + if (rc) + return rc; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr, unsigned long *datptr) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt = 0; + int dat_protection, fake; + int rc; + + mmap_read_lock(sg->mm); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu->kvm); + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + if (rc) + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); + + vaddr.addr = saddr; + if (fake) { + pte.val = pgt + vaddr.px * PAGE_SIZE; + goto shadow_page; + } + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && pte.z) + rc = PGM_TRANSLATION_SPEC; +shadow_page: + pte.p |= dat_protection; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu->kvm); + mmap_read_unlock(sg->mm); + return rc; +} diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h new file mode 100644 index 0000000000..b320d12aa0 --- /dev/null +++ b/arch/s390/kvm/gaccess.h @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * access guest memory + * + * Copyright IBM Corp. 2008, 2014 + * + * Author(s): Carsten Otte + */ + +#ifndef __KVM_S390_GACCESS_H +#define __KVM_S390_GACCESS_H + +#include +#include +#include +#include +#include "kvm-s390.h" + +/** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @prefix - guest prefix + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of by applying the given prefix. + */ +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) +{ + if (gra < 2 * PAGE_SIZE) + gra += prefix; + else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) + gra -= prefix; + return gra; +} + +/** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extendended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned. + */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** + * kvm_s390_logical_to_effective - convert guest logical to effective address + * @vcpu: guest virtual cpu + * @ga: guest logical address + * + * Convert a guest vcpu logical address to a guest vcpu effective address by + * applying the rules of the vcpu's addressing mode defined by PSW bits 31 + * and 32 (extendended/basic addressing mode). + * + * Depending on the vcpu's addressing mode the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing mode) + * of @ga will be zeroed and the remaining bits will be returned. + */ +static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, + unsigned long ga) +{ + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); +} + +/* + * put_guest_lc, read_guest_lc and write_guest_lc are guest access functions + * which shall only be used to access the lowcore of a vcpu. + * These functions should be used for e.g. interrupt handlers where no + * guest memory access protection facilities, like key or low address + * protection, are applicable. + * At a later point guest vcpu lowcore access should happen via pinned + * prefix pages, so that these pages can be accessed directly via the + * kernel mapping. All of these *_lc functions can be removed then. + */ + +/** + * put_guest_lc - write a simple variable to a guest vcpu's lowcore + * @vcpu: virtual cpu + * @x: value to copy to guest + * @gra: vcpu's destination guest real address + * + * Copies a simple value from kernel space to a guest vcpu's lowcore. + * The size of the variable may be 1, 2, 4 or 8 bytes. The destination + * must be located in the vcpu's lowcore. Otherwise the result is undefined. + * + * Returns zero on success or -EFAULT on error. + * + * Note: an error indicates that either the kernel is out of memory or + * the guest memory mapping is broken. In any case the best solution + * would be to terminate the guest. + * It is wrong to inject a guest exception. + */ +#define put_guest_lc(vcpu, x, gra) \ +({ \ + struct kvm_vcpu *__vcpu = (vcpu); \ + __typeof__(*(gra)) __x = (x); \ + unsigned long __gpa; \ + \ + __gpa = (unsigned long)(gra); \ + __gpa += kvm_s390_get_prefix(__vcpu); \ + kvm_write_guest(__vcpu->kvm, __gpa, &__x, sizeof(__x)); \ +}) + +/** + * write_guest_lc - copy data from kernel space to guest vcpu's lowcore + * @vcpu: virtual cpu + * @gra: vcpu's source guest real address + * @data: source address in kernel space + * @len: number of bytes to copy + * + * Copy data from kernel space to guest vcpu's lowcore. The entire range must + * be located within the vcpu's lowcore, otherwise the result is undefined. + * + * Returns zero on success or -EFAULT on error. + * + * Note: an error indicates that either the kernel is out of memory or + * the guest memory mapping is broken. In any case the best solution + * would be to terminate the guest. + * It is wrong to inject a guest exception. + */ +static inline __must_check +int write_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, + unsigned long len) +{ + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); + + return kvm_write_guest(vcpu->kvm, gpa, data, len); +} + +/** + * read_guest_lc - copy data from guest vcpu's lowcore to kernel space + * @vcpu: virtual cpu + * @gra: vcpu's source guest real address + * @data: destination address in kernel space + * @len: number of bytes to copy + * + * Copy data from guest vcpu's lowcore to kernel space. The entire range must + * be located within the vcpu's lowcore, otherwise the result is undefined. + * + * Returns zero on success or -EFAULT on error. + * + * Note: an error indicates that either the kernel is out of memory or + * the guest memory mapping is broken. In any case the best solution + * would be to terminate the guest. + * It is wrong to inject a guest exception. + */ +static inline __must_check +int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, + unsigned long len) +{ + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); + + return kvm_read_guest(vcpu->kvm, gpa, data, len); +} + +enum gacc_mode { + GACC_FETCH, + GACC_STORE, + GACC_IFETCH, +}; + +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key); + +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long length, enum gacc_mode mode, u8 access_key); + +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key); + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key); + +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key); + +int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, + void *data, unsigned long len, enum gacc_mode mode); + +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + +/** + * write_guest_with_key - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @data (kernel space) to @ga (guest address). + * In order to copy data to guest space the PSW of the vcpu is inspected: + * If DAT is off data will be copied to guest real or absolute memory. + * If DAT is on data will be copied to the address space as specified by + * the address space bits of the PSW: + * Primary, secondary, home space or access register mode. + * The addressing mode of the PSW is also inspected, so that address wrap + * around is taken into account for 24-, 31- and 64-bit addressing mode, + * if the to be copied data crosses page boundaries in guest address space. + * In addition low address, DAT and key protection checks are performed before + * copying any data. + * + * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. + * In case of an access exception (e.g. protection exception) pgm will contain + * all data necessary so that a subsequent call to 'kvm_s390_inject_prog_vcpu()' + * will inject a correct exception into the guest. + * If no access exception happened, the contents of pgm are undefined when + * this function returns. + * + * Returns: - zero on success + * - a negative value if e.g. the guest mapping is broken or in + * case of out-of-memory. In this case the contents of pgm are + * undefined. Also parts of @data may have been copied to guest + * space. + * - a positive value if an access exception happened. In this case + * the returned value is the program interruption code and the + * contents of pgm may be used to inject an exception into the + * guest. No data has been copied to guest space. + * + * Note: in case an access exception is recognized no data has been copied to + * guest space (this is also true, if the to be copied data would cross + * one or more page boundaries in guest space). + * Therefore this function may be used for nullifying and suppressing + * instruction emulation. + * It may also be used for terminating instructions, if it is undefined + * if data has been changed in guest space in case of an exception. + */ +static inline __must_check +int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE, + access_key); +} + +/** + * write_guest - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * + * The behaviour of write_guest is identical to write_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check +int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, + unsigned long len) +{ + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return write_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_with_key - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest_with_key is identical to write_guest_with_key, + * except that data will be copied from guest space to kernel space. + */ +static inline __must_check +int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH, + access_key); +} + +/** + * read_guest - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest is identical to read_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check +int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, + unsigned long len) +{ + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return read_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_instr - copy instruction data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @data: destination address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from the given address (guest space) to @data (kernel + * space). + * + * The behaviour of read_guest_instr is identical to read_guest, except that + * instruction data will be read from primary space when in home-space or + * address-space mode. + */ +static inline __must_check +int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, + unsigned long len) +{ + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH, + access_key); +} + +/** + * write_guest_abs - copy data from kernel space to guest space absolute + * @vcpu: virtual cpu + * @gpa: guest physical (absolute) address + * @data: source address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from @data (kernel space) to @gpa (guest absolute address). + * It is up to the caller to ensure that the entire guest memory range is + * valid memory before calling this function. + * Guest low address and key protection are not checked. + * + * Returns zero on success or -EFAULT on error. + * + * If an error occurs data may have been copied partially to guest memory. + */ +static inline __must_check +int write_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, + unsigned long len) +{ + return kvm_write_guest(vcpu->kvm, gpa, data, len); +} + +/** + * read_guest_abs - copy data from guest space absolute to kernel space + * @vcpu: virtual cpu + * @gpa: guest physical (absolute) address + * @data: destination address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from @gpa (guest absolute address) to @data (kernel space). + * It is up to the caller to ensure that the entire guest memory range is + * valid memory before calling this function. + * Guest key protection is not checked. + * + * Returns zero on success or -EFAULT on error. + * + * If an error occurs data may have been copied partially to kernel space. + */ +static inline __must_check +int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, + unsigned long len) +{ + return kvm_read_guest(vcpu->kvm, gpa, data, len); +} + +/** + * write_guest_real - copy data from kernel space to guest space real + * @vcpu: virtual cpu + * @gra: guest real address + * @data: source address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from @data (kernel space) to @gra (guest real address). + * It is up to the caller to ensure that the entire guest memory range is + * valid memory before calling this function. + * Guest low address and key protection are not checked. + * + * Returns zero on success or -EFAULT on error. + * + * If an error occurs data may have been copied partially to guest memory. + */ +static inline __must_check +int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, + unsigned long len) +{ + return access_guest_real(vcpu, gra, data, len, 1); +} + +/** + * read_guest_real - copy data from guest space real to kernel space + * @vcpu: virtual cpu + * @gra: guest real address + * @data: destination address in kernel space + * @len: number of bytes to copy + * + * Copy @len bytes from @gra (guest real address) to @data (kernel space). + * It is up to the caller to ensure that the entire guest memory range is + * valid memory before calling this function. + * Guest key protection is not checked. + * + * Returns zero on success or -EFAULT on error. + * + * If an error occurs data may have been copied partially to kernel space. + */ +static inline __must_check +int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, + unsigned long len) +{ + return access_guest_real(vcpu, gra, data, len, 0); +} + +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); +int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); + +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr, unsigned long *datptr); + +#endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c new file mode 100644 index 0000000000..3765c4223b --- /dev/null +++ b/arch/s390/kvm/guestdbg.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kvm guest debug support + * + * Copyright IBM Corp. 2014 + * + * Author(s): David Hildenbrand + */ +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" + +/* + * Extends the address range given by *start and *stop to include the address + * range starting with estart and the length len. Takes care of overflowing + * intervals and tries to minimize the overall interval size. + */ +static void extend_address_range(u64 *start, u64 *stop, u64 estart, int len) +{ + u64 estop; + + if (len > 0) + len--; + else + len = 0; + + estop = estart + len; + + /* 0-0 range represents "not set" */ + if ((*start == 0) && (*stop == 0)) { + *start = estart; + *stop = estop; + } else if (*start <= *stop) { + /* increase the existing range */ + if (estart < *start) + *start = estart; + if (estop > *stop) + *stop = estop; + } else { + /* "overflowing" interval, whereby *stop > *start */ + if (estart <= *stop) { + if (estop > *stop) + *stop = estop; + } else if (estop > *start) { + if (estart < *start) + *start = estart; + } + /* minimize the range */ + else if ((estop - *stop) < (*start - estart)) + *stop = estop; + else + *start = estart; + } +} + +#define MAX_INST_SIZE 6 + +static void enable_all_hw_bp(struct kvm_vcpu *vcpu) +{ + unsigned long start, len; + u64 *cr9 = &vcpu->arch.sie_block->gcr[9]; + u64 *cr10 = &vcpu->arch.sie_block->gcr[10]; + u64 *cr11 = &vcpu->arch.sie_block->gcr[11]; + int i; + + if (vcpu->arch.guestdbg.nr_hw_bp <= 0 || + vcpu->arch.guestdbg.hw_bp_info == NULL) + return; + + /* + * If the guest is not interested in branching events, we can safely + * limit them to the PER address range. + */ + if (!(*cr9 & PER_EVENT_BRANCH)) + *cr9 |= PER_CONTROL_BRANCH_ADDRESS; + *cr9 |= PER_EVENT_IFETCH | PER_EVENT_BRANCH; + + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_bp; i++) { + start = vcpu->arch.guestdbg.hw_bp_info[i].addr; + len = vcpu->arch.guestdbg.hw_bp_info[i].len; + + /* + * The instruction in front of the desired bp has to + * report instruction-fetching events + */ + if (start < MAX_INST_SIZE) { + len += start; + start = 0; + } else { + start -= MAX_INST_SIZE; + len += MAX_INST_SIZE; + } + + extend_address_range(cr10, cr11, start, len); + } +} + +static void enable_all_hw_wp(struct kvm_vcpu *vcpu) +{ + unsigned long start, len; + u64 *cr9 = &vcpu->arch.sie_block->gcr[9]; + u64 *cr10 = &vcpu->arch.sie_block->gcr[10]; + u64 *cr11 = &vcpu->arch.sie_block->gcr[11]; + int i; + + if (vcpu->arch.guestdbg.nr_hw_wp <= 0 || + vcpu->arch.guestdbg.hw_wp_info == NULL) + return; + + /* if host uses storage alternation for special address + * spaces, enable all events and give all to the guest */ + if (*cr9 & PER_EVENT_STORE && *cr9 & PER_CONTROL_ALTERATION) { + *cr9 &= ~PER_CONTROL_ALTERATION; + *cr10 = 0; + *cr11 = -1UL; + } else { + *cr9 &= ~PER_CONTROL_ALTERATION; + *cr9 |= PER_EVENT_STORE; + + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { + start = vcpu->arch.guestdbg.hw_wp_info[i].addr; + len = vcpu->arch.guestdbg.hw_wp_info[i].len; + + extend_address_range(cr10, cr11, start, len); + } + } +} + +void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu) +{ + vcpu->arch.guestdbg.cr0 = vcpu->arch.sie_block->gcr[0]; + vcpu->arch.guestdbg.cr9 = vcpu->arch.sie_block->gcr[9]; + vcpu->arch.guestdbg.cr10 = vcpu->arch.sie_block->gcr[10]; + vcpu->arch.guestdbg.cr11 = vcpu->arch.sie_block->gcr[11]; +} + +void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu) +{ + vcpu->arch.sie_block->gcr[0] = vcpu->arch.guestdbg.cr0; + vcpu->arch.sie_block->gcr[9] = vcpu->arch.guestdbg.cr9; + vcpu->arch.sie_block->gcr[10] = vcpu->arch.guestdbg.cr10; + vcpu->arch.sie_block->gcr[11] = vcpu->arch.guestdbg.cr11; +} + +void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu) +{ + /* + * TODO: if guest psw has per enabled, otherwise 0s! + * This reduces the amount of reported events. + * Need to intercept all psw changes! + */ + + if (guestdbg_sstep_enabled(vcpu)) { + /* disable timer (clock-comparator) interrupts */ + vcpu->arch.sie_block->gcr[0] &= ~CR0_CLOCK_COMPARATOR_SUBMASK; + vcpu->arch.sie_block->gcr[9] |= PER_EVENT_IFETCH; + vcpu->arch.sie_block->gcr[10] = 0; + vcpu->arch.sie_block->gcr[11] = -1UL; + } + + if (guestdbg_hw_bp_enabled(vcpu)) { + enable_all_hw_bp(vcpu); + enable_all_hw_wp(vcpu); + } + + /* TODO: Instruction-fetching-nullification not allowed for now */ + if (vcpu->arch.sie_block->gcr[9] & PER_EVENT_NULLIFICATION) + vcpu->arch.sie_block->gcr[9] &= ~PER_EVENT_NULLIFICATION; +} + +#define MAX_WP_SIZE 100 + +static int __import_wp_info(struct kvm_vcpu *vcpu, + struct kvm_hw_breakpoint *bp_data, + struct kvm_hw_wp_info_arch *wp_info) +{ + int ret = 0; + wp_info->len = bp_data->len; + wp_info->addr = bp_data->addr; + wp_info->phys_addr = bp_data->phys_addr; + wp_info->old_data = NULL; + + if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE) + return -EINVAL; + + wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT); + if (!wp_info->old_data) + return -ENOMEM; + /* try to backup the original value */ + ret = read_guest_abs(vcpu, wp_info->phys_addr, wp_info->old_data, + wp_info->len); + if (ret) { + kfree(wp_info->old_data); + wp_info->old_data = NULL; + } + + return ret; +} + +#define MAX_BP_COUNT 50 + +int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + int ret = 0, nr_wp = 0, nr_bp = 0, i; + struct kvm_hw_breakpoint *bp_data = NULL; + struct kvm_hw_wp_info_arch *wp_info = NULL; + struct kvm_hw_bp_info_arch *bp_info = NULL; + + if (dbg->arch.nr_hw_bp <= 0 || !dbg->arch.hw_bp) + return 0; + else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) + return -EINVAL; + + bp_data = memdup_user(dbg->arch.hw_bp, + sizeof(*bp_data) * dbg->arch.nr_hw_bp); + if (IS_ERR(bp_data)) + return PTR_ERR(bp_data); + + for (i = 0; i < dbg->arch.nr_hw_bp; i++) { + switch (bp_data[i].type) { + case KVM_HW_WP_WRITE: + nr_wp++; + break; + case KVM_HW_BP: + nr_bp++; + break; + default: + break; + } + } + + if (nr_wp > 0) { + wp_info = kmalloc_array(nr_wp, + sizeof(*wp_info), + GFP_KERNEL_ACCOUNT); + if (!wp_info) { + ret = -ENOMEM; + goto error; + } + } + if (nr_bp > 0) { + bp_info = kmalloc_array(nr_bp, + sizeof(*bp_info), + GFP_KERNEL_ACCOUNT); + if (!bp_info) { + ret = -ENOMEM; + goto error; + } + } + + for (nr_wp = 0, nr_bp = 0, i = 0; i < dbg->arch.nr_hw_bp; i++) { + switch (bp_data[i].type) { + case KVM_HW_WP_WRITE: + ret = __import_wp_info(vcpu, &bp_data[i], + &wp_info[nr_wp]); + if (ret) + goto error; + nr_wp++; + break; + case KVM_HW_BP: + bp_info[nr_bp].len = bp_data[i].len; + bp_info[nr_bp].addr = bp_data[i].addr; + nr_bp++; + break; + } + } + + vcpu->arch.guestdbg.nr_hw_bp = nr_bp; + vcpu->arch.guestdbg.hw_bp_info = bp_info; + vcpu->arch.guestdbg.nr_hw_wp = nr_wp; + vcpu->arch.guestdbg.hw_wp_info = wp_info; + return 0; +error: + kfree(bp_data); + kfree(wp_info); + kfree(bp_info); + return ret; +} + +void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_hw_wp_info_arch *hw_wp_info = NULL; + + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { + hw_wp_info = &vcpu->arch.guestdbg.hw_wp_info[i]; + kfree(hw_wp_info->old_data); + hw_wp_info->old_data = NULL; + } + kfree(vcpu->arch.guestdbg.hw_wp_info); + vcpu->arch.guestdbg.hw_wp_info = NULL; + + kfree(vcpu->arch.guestdbg.hw_bp_info); + vcpu->arch.guestdbg.hw_bp_info = NULL; + + vcpu->arch.guestdbg.nr_hw_wp = 0; + vcpu->arch.guestdbg.nr_hw_bp = 0; +} + +static inline int in_addr_range(u64 addr, u64 a, u64 b) +{ + if (a <= b) + return (addr >= a) && (addr <= b); + else + /* "overflowing" interval */ + return (addr >= a) || (addr <= b); +} + +#define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1) + +static struct kvm_hw_bp_info_arch *find_hw_bp(struct kvm_vcpu *vcpu, + unsigned long addr) +{ + struct kvm_hw_bp_info_arch *bp_info = vcpu->arch.guestdbg.hw_bp_info; + int i; + + if (vcpu->arch.guestdbg.nr_hw_bp == 0) + return NULL; + + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_bp; i++) { + /* addr is directly the start or in the range of a bp */ + if (addr == bp_info->addr) + goto found; + if (bp_info->len > 0 && + in_addr_range(addr, bp_info->addr, end_of_range(bp_info))) + goto found; + + bp_info++; + } + + return NULL; +found: + return bp_info; +} + +static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_hw_wp_info_arch *wp_info = NULL; + void *temp = NULL; + + if (vcpu->arch.guestdbg.nr_hw_wp == 0) + return NULL; + + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { + wp_info = &vcpu->arch.guestdbg.hw_wp_info[i]; + if (!wp_info || !wp_info->old_data || wp_info->len <= 0) + continue; + + temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT); + if (!temp) + continue; + + /* refetch the wp data and compare it to the old value */ + if (!read_guest_abs(vcpu, wp_info->phys_addr, temp, + wp_info->len)) { + if (memcmp(temp, wp_info->old_data, wp_info->len)) { + kfree(temp); + return wp_info; + } + } + kfree(temp); + temp = NULL; + } + + return NULL; +} + +void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; +} + +#define PER_CODE_MASK (PER_EVENT_MASK >> 24) +#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24) +#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24) +#define PER_CODE_STORE (PER_EVENT_STORE >> 24) +#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24) + +#define per_bp_event(code) \ + (code & (PER_CODE_IFETCH | PER_CODE_BRANCH)) +#define per_write_wp_event(code) \ + (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) + +static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc, + unsigned long peraddr) +{ + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + struct kvm_hw_wp_info_arch *wp_info = NULL; + struct kvm_hw_bp_info_arch *bp_info = NULL; + unsigned long addr = vcpu->arch.sie_block->gpsw.addr; + + if (guestdbg_hw_bp_enabled(vcpu)) { + if (per_write_wp_event(perc) && + vcpu->arch.guestdbg.nr_hw_wp > 0) { + wp_info = any_wp_changed(vcpu); + if (wp_info) { + debug_exit->addr = wp_info->addr; + debug_exit->type = KVM_HW_WP_WRITE; + goto exit_required; + } + } + if (per_bp_event(perc) && + vcpu->arch.guestdbg.nr_hw_bp > 0) { + bp_info = find_hw_bp(vcpu, addr); + /* remove duplicate events if PC==PER address */ + if (bp_info && (addr != peraddr)) { + debug_exit->addr = addr; + debug_exit->type = KVM_HW_BP; + vcpu->arch.guestdbg.last_bp = addr; + goto exit_required; + } + /* breakpoint missed */ + bp_info = find_hw_bp(vcpu, peraddr); + if (bp_info && vcpu->arch.guestdbg.last_bp != peraddr) { + debug_exit->addr = peraddr; + debug_exit->type = KVM_HW_BP; + goto exit_required; + } + } + } + if (guestdbg_sstep_enabled(vcpu) && per_bp_event(perc)) { + debug_exit->addr = addr; + debug_exit->type = KVM_SINGLESTEP; + goto exit_required; + } + + return 0; +exit_required: + return 1; +} + +static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr) +{ + u8 exec_ilen = 0; + u16 opcode[3]; + int rc; + + if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) { + /* PER address references the fetched or the execute instr */ + *addr = vcpu->arch.sie_block->peraddr; + /* + * Manually detect if we have an EXECUTE instruction. As + * instructions are always 2 byte aligned we can read the + * first two bytes unconditionally + */ + rc = read_guest_instr(vcpu, *addr, &opcode, 2); + if (rc) + return rc; + if (opcode[0] >> 8 == 0x44) + exec_ilen = 4; + if ((opcode[0] & 0xff0f) == 0xc600) + exec_ilen = 6; + } else { + /* instr was suppressed, calculate the responsible instr */ + *addr = __rewind_psw(vcpu->arch.sie_block->gpsw, + kvm_s390_get_ilen(vcpu)); + if (vcpu->arch.sie_block->icptstatus & 0x01) { + exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4; + if (!exec_ilen) + exec_ilen = 4; + } + } + + if (exec_ilen) { + /* read the complete EXECUTE instr to detect the fetched addr */ + rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen); + if (rc) + return rc; + if (exec_ilen == 6) { + /* EXECUTE RELATIVE LONG - RIL-b format */ + s32 rl = *((s32 *) (opcode + 1)); + + /* rl is a _signed_ 32 bit value specifying halfwords */ + *addr += (u64)(s64) rl * 2; + } else { + /* EXECUTE - RX-a format */ + u32 base = (opcode[1] & 0xf000) >> 12; + u32 disp = opcode[1] & 0x0fff; + u32 index = opcode[0] & 0x000f; + + *addr = base ? vcpu->run->s.regs.gprs[base] : 0; + *addr += index ? vcpu->run->s.regs.gprs[index] : 0; + *addr += disp; + } + *addr = kvm_s390_logical_to_effective(vcpu, *addr); + } + return 0; +} + +#define guest_per_enabled(vcpu) \ + (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) + +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u64 cr10 = vcpu->arch.sie_block->gcr[10]; + const u64 cr11 = vcpu->arch.sie_block->gcr[11]; + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_CODE_IFETCH, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + unsigned long fetched_addr; + int rc; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). + */ + if (!guestdbg_enabled(vcpu)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + + if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address)) + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + + if (!guest_per_enabled(vcpu) || + !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) + return 0; + + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + if (rc) + /* instruction-fetching exceptions */ + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (in_addr_range(fetched_addr, cr10, cr11)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + return 0; +} + +static int filter_guest_per_event(struct kvm_vcpu *vcpu) +{ + const u8 perc = vcpu->arch.sie_block->perc; + u64 addr = vcpu->arch.sie_block->gpsw.addr; + u64 cr9 = vcpu->arch.sie_block->gcr[9]; + u64 cr10 = vcpu->arch.sie_block->gcr[10]; + u64 cr11 = vcpu->arch.sie_block->gcr[11]; + /* filter all events, demanded by the guest */ + u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; + unsigned long fetched_addr; + int rc; + + if (!guest_per_enabled(vcpu)) + guest_perc = 0; + + /* filter "successful-branching" events */ + if (guest_perc & PER_CODE_BRANCH && + cr9 & PER_CONTROL_BRANCH_ADDRESS && + !in_addr_range(addr, cr10, cr11)) + guest_perc &= ~PER_CODE_BRANCH; + + /* filter "instruction-fetching" events */ + if (guest_perc & PER_CODE_IFETCH) { + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + /* + * Don't inject an irq on exceptions. This would make handling + * on icpt code 8 very complex (as PSW was already rewound). + */ + if (rc || !in_addr_range(fetched_addr, cr10, cr11)) + guest_perc &= ~PER_CODE_IFETCH; + } + + /* All other PER events will be given to the guest */ + /* TODO: Check altered address/address space */ + + vcpu->arch.sie_block->perc = guest_perc; + + if (!guest_perc) + vcpu->arch.sie_block->iprcc &= ~PGM_PER; + return 0; +} + +#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) +#define hssec(vcpu) (vcpu->arch.sie_block->gcr[13] & _ASCE_SPACE_SWITCH) +#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) +#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) + +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) +{ + int rc, new_as; + + if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, + vcpu->arch.sie_block->peraddr)) + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + + rc = filter_guest_per_event(vcpu); + if (rc) + return rc; + + /* + * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger + * a space-switch event. PER events enforce space-switch events + * for these instructions. So if no PER event for the guest is left, + * we might have to filter the space-switch element out, too. + */ + if (vcpu->arch.sie_block->iprcc == PGM_SPACE_SWITCH) { + vcpu->arch.sie_block->iprcc = 0; + new_as = psw_bits(vcpu->arch.sie_block->gpsw).as; + + /* + * If the AS changed from / to home, we had RP, SAC or SACF + * instruction. Check primary and home space-switch-event + * controls. (theoretically home -> home produced no event) + */ + if (((new_as == PSW_BITS_AS_HOME) ^ old_as_is_home(vcpu)) && + (pssec(vcpu) || hssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + + /* + * PT, PTI, PR, PC instruction operate on primary AS only. Check + * if the primary-space-switch-event control was or got set. + */ + if (new_as == PSW_BITS_AS_PRIMARY && !old_as_is_home(vcpu) && + (pssec(vcpu) || old_ssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + } + return 0; +} diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c new file mode 100644 index 0000000000..b16352083f --- /dev/null +++ b/arch/s390/kvm/intercept.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * in-kernel handling for sie intercepts + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + * Christian Borntraeger + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "kvm-s390.h" +#include "gaccess.h" +#include "trace.h" +#include "trace-s390.h" + +u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; + u8 ilen = 0; + + switch (vcpu->arch.sie_block->icptcode) { + case ICPT_INST: + case ICPT_INSTPROGI: + case ICPT_OPEREXC: + case ICPT_PARTEXEC: + case ICPT_IOINST: + /* instruction only stored for these icptcodes */ + ilen = insn_length(vcpu->arch.sie_block->ipa >> 8); + /* Use the length of the EXECUTE instruction if necessary */ + if (sie_block->icptstatus & 1) { + ilen = (sie_block->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + break; + case ICPT_PROGI: + /* bit 1+2 of pgmilc are the ilc, so we directly get ilen */ + ilen = vcpu->arch.sie_block->pgmilc & 0x6; + break; + } + return ilen; +} + +static int handle_stop(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc = 0; + uint8_t flags, stop_pending; + + vcpu->stat.exit_stop_request++; + + /* delay the stop if any non-stop irq is pending */ + if (kvm_s390_vcpu_has_irq(vcpu, 1)) + return 0; + + /* avoid races with the injection/SIGP STOP code */ + spin_lock(&li->lock); + flags = li->irq.stop.flags; + stop_pending = kvm_s390_is_stop_irq_pending(vcpu); + spin_unlock(&li->lock); + + trace_kvm_s390_stop_request(stop_pending, flags); + if (!stop_pending) + return 0; + + if (flags & KVM_S390_STOP_FLAG_STORE_STATUS) { + rc = kvm_s390_vcpu_store_status(vcpu, + KVM_S390_STORE_STATUS_NOADDR); + if (rc) + return rc; + } + + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + return -EOPNOTSUPP; +} + +static int handle_validity(struct kvm_vcpu *vcpu) +{ + int viwhy = vcpu->arch.sie_block->ipb >> 16; + + vcpu->stat.exit_validity++; + trace_kvm_s390_intercept_validity(vcpu, viwhy); + KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy, + current->pid, vcpu->kvm); + + /* do not warn on invalid runtime instrumentation mode */ + WARN_ONCE(viwhy != 0x44, "kvm: unhandled validity intercept 0x%x\n", + viwhy); + return -EINVAL; +} + +static int handle_instruction(struct kvm_vcpu *vcpu) +{ + vcpu->stat.exit_instruction++; + trace_kvm_s390_intercept_instruction(vcpu, + vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); + + switch (vcpu->arch.sie_block->ipa >> 8) { + case 0x01: + return kvm_s390_handle_01(vcpu); + case 0x82: + return kvm_s390_handle_lpsw(vcpu); + case 0x83: + return kvm_s390_handle_diag(vcpu); + case 0xaa: + return kvm_s390_handle_aa(vcpu); + case 0xae: + return kvm_s390_handle_sigp(vcpu); + case 0xb2: + return kvm_s390_handle_b2(vcpu); + case 0xb6: + return kvm_s390_handle_stctl(vcpu); + case 0xb7: + return kvm_s390_handle_lctl(vcpu); + case 0xb9: + return kvm_s390_handle_b9(vcpu); + case 0xe3: + return kvm_s390_handle_e3(vcpu); + case 0xe5: + return kvm_s390_handle_e5(vcpu); + case 0xeb: + return kvm_s390_handle_eb(vcpu); + default: + return -EOPNOTSUPP; + } +} + +static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_pgm_info pgm_info = { + .code = vcpu->arch.sie_block->iprcc, + /* the PSW has already been rewound */ + .flags = KVM_S390_PGM_FLAGS_NO_REWIND, + }; + + switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) { + case PGM_AFX_TRANSLATION: + case PGM_ASX_TRANSLATION: + case PGM_EX_TRANSLATION: + case PGM_LFX_TRANSLATION: + case PGM_LSTE_SEQUENCE: + case PGM_LSX_TRANSLATION: + case PGM_LX_TRANSLATION: + case PGM_PRIMARY_AUTHORITY: + case PGM_SECONDARY_AUTHORITY: + case PGM_SPACE_SWITCH: + pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc; + break; + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_INSTANCE: + case PGM_ASTE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_EXTENDED_AUTHORITY: + pgm_info.exc_access_id = vcpu->arch.sie_block->eai; + break; + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc; + pgm_info.exc_access_id = vcpu->arch.sie_block->eai; + pgm_info.op_access_id = vcpu->arch.sie_block->oai; + break; + case PGM_MONITOR: + pgm_info.mon_class_nr = vcpu->arch.sie_block->mcn; + pgm_info.mon_code = vcpu->arch.sie_block->tecmc; + break; + case PGM_VECTOR_PROCESSING: + case PGM_DATA: + pgm_info.data_exc_code = vcpu->arch.sie_block->dxc; + break; + case PGM_PROTECTION: + pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc; + pgm_info.exc_access_id = vcpu->arch.sie_block->eai; + break; + default: + break; + } + + if (vcpu->arch.sie_block->iprcc & PGM_PER) { + pgm_info.per_code = vcpu->arch.sie_block->perc; + pgm_info.per_atmid = vcpu->arch.sie_block->peratmid; + pgm_info.per_address = vcpu->arch.sie_block->peraddr; + pgm_info.per_access_id = vcpu->arch.sie_block->peraid; + } + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + +/* + * restore ITDB to program-interruption TDB in guest lowcore + * and set TX abort indication if required +*/ +static int handle_itdb(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_itdb *itdb; + int rc; + + if (!IS_TE_ENABLED(vcpu) || !IS_ITDB_VALID(vcpu)) + return 0; + if (current->thread.per_flags & PER_FLAG_NO_TE) + return 0; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); + rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); + if (rc) + return rc; + memset(itdb, 0, sizeof(*itdb)); + + return 0; +} + +#define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) + +static bool should_handle_per_event(const struct kvm_vcpu *vcpu) +{ + if (!guestdbg_enabled(vcpu) || !per_event(vcpu)) + return false; + if (guestdbg_sstep_enabled(vcpu) && + vcpu->arch.sie_block->iprcc != PGM_PER) { + /* + * __vcpu_run() will exit after delivering the concurrently + * indicated condition. + */ + return false; + } + return true; +} + +static int handle_prog(struct kvm_vcpu *vcpu) +{ + psw_t psw; + int rc; + + vcpu->stat.exit_program_interruption++; + + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + + if (should_handle_per_event(vcpu)) { + rc = kvm_s390_handle_per_event(vcpu); + if (rc) + return rc; + /* the interrupt might have been filtered out completely */ + if (vcpu->arch.sie_block->iprcc == 0) + return 0; + } + + trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc); + if (vcpu->arch.sie_block->iprcc == PGM_SPECIFICATION) { + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &psw, sizeof(psw_t)); + if (rc) + return rc; + /* Avoid endless loops of specification exceptions */ + if (!is_valid_psw(&psw)) + return -EOPNOTSUPP; + } + rc = handle_itdb(vcpu); + if (rc) + return rc; + + return inject_prog_on_prog_intercept(vcpu); +} + +/** + * handle_external_interrupt - used for external interruption interceptions + * @vcpu: virtual cpu + * + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. + * + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. + */ +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ + u16 eic = vcpu->arch.sie_block->eic; + struct kvm_s390_irq irq; + psw_t newpsw; + int rc; + + vcpu->stat.exit_external_interrupt++; + + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. + */ + if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && + (newpsw.mask & PSW_MASK_EXT)) + return -EOPNOTSUPP; + + switch (eic) { + case EXT_IRQ_CLK_COMP: + irq.type = KVM_S390_INT_CLOCK_COMP; + break; + case EXT_IRQ_CPU_TIMER: + irq.type = KVM_S390_INT_CPU_TIMER; + break; + case EXT_IRQ_EXTERNAL_CALL: + irq.type = KVM_S390_INT_EXTERNAL_CALL; + irq.u.extcall.code = vcpu->arch.sie_block->extcpuaddr; + rc = kvm_s390_inject_vcpu(vcpu, &irq); + /* ignore if another external call is already pending */ + if (rc == -EBUSY) + return 0; + return rc; + default: + return -EOPNOTSUPP; + } + + return kvm_s390_inject_vcpu(vcpu, &irq); +} + +/** + * handle_mvpg_pei - Handle MOVE PAGE partial execution interception. + * @vcpu: virtual cpu + * + * This interception can only happen for guests with DAT disabled and + * addresses that are currently not mapped in the host. Thus we try to + * set up the mappings for the corresponding user pages here (or throw + * addressing exceptions in case of illegal guest addresses). + */ +static int handle_mvpg_pei(struct kvm_vcpu *vcpu) +{ + unsigned long srcaddr, dstaddr; + int reg1, reg2, rc; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2], + reg2, &srcaddr, GACC_FETCH, 0); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); + if (rc != 0) + return rc; + + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1], + reg1, &dstaddr, GACC_STORE, 0); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); + if (rc != 0) + return rc; + + kvm_s390_retry_instr(vcpu); + + return 0; +} + +static int handle_partial_execution(struct kvm_vcpu *vcpu) +{ + vcpu->stat.exit_pei++; + + if (vcpu->arch.sie_block->ipa == 0xb254) /* MVPG */ + return handle_mvpg_pei(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) /* SIGP */ + return kvm_s390_handle_sigp_pei(vcpu); + + return -EOPNOTSUPP; +} + +/* + * Handle the sthyi instruction that provides the guest with system + * information, like current CPU resources available at each level of + * the machine. + */ +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, cc = 0, r = 0; + u64 code, addr, rc = 0; + struct sthyi_sctns *sctns = NULL; + + if (!test_kvm_facility(vcpu->kvm, 74)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + rc = 4; + goto out; + } + + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!sctns) + return -ENOMEM; + + cc = sthyi_fill(sctns, &rc); + if (cc < 0) { + free_page((unsigned long)sctns); + return cc; + } +out: + if (!cc) { + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = rc; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} + +static int handle_operexc(struct kvm_vcpu *vcpu) +{ + psw_t oldpsw, newpsw; + int rc; + + vcpu->stat.exit_operation_exception++; + trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); + + if (vcpu->arch.sie_block->ipa == 0xb256) + return handle_sthyi(vcpu); + + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + /* + * Avoid endless loops of operation exceptions, if the pgm new + * PSW will cause a new operation exception. + * The heuristic checks if the pgm new psw is within 6 bytes before + * the faulting psw address (with same DAT, AS settings) and the + * new psw is not a wait psw and the fault was not triggered by + * problem state. + */ + oldpsw = vcpu->arch.sie_block->gpsw; + if (oldpsw.addr - newpsw.addr <= 6 && + !(newpsw.mask & PSW_MASK_WAIT) && + !(oldpsw.mask & PSW_MASK_PSTATE) && + (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) && + (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT)) + return -EOPNOTSUPP; + + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ + u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. + */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + /* + * If we got -EAGAIN here, we simply return it. It will eventually + * get propagated all the way to userspace, which should then try + * again. + */ + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + int ret; + + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } + + return handle_instruction(vcpu); +} + +static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc) +{ + /* Process PER, also if the instruction is processed in user space. */ + if (!(vcpu->arch.sie_block->icptstatus & 0x02)) + return false; + if (rc != 0 && rc != -EOPNOTSUPP) + return false; + if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs) + /* __vcpu_run() will exit after delivering the interrupt. */ + return false; + return true; +} + +int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) +{ + int rc, per_rc = 0; + + if (kvm_is_ucontrol(vcpu->kvm)) + return -EOPNOTSUPP; + + switch (vcpu->arch.sie_block->icptcode) { + case ICPT_EXTREQ: + vcpu->stat.exit_external_request++; + return 0; + case ICPT_IOREQ: + vcpu->stat.exit_io_request++; + return 0; + case ICPT_INST: + rc = handle_instruction(vcpu); + break; + case ICPT_PROGI: + return handle_prog(vcpu); + case ICPT_EXTINT: + return handle_external_interrupt(vcpu); + case ICPT_WAIT: + return kvm_s390_handle_wait(vcpu); + case ICPT_VALIDITY: + return handle_validity(vcpu); + case ICPT_STOP: + return handle_stop(vcpu); + case ICPT_OPEREXC: + rc = handle_operexc(vcpu); + break; + case ICPT_PARTEXEC: + rc = handle_partial_execution(vcpu); + break; + case ICPT_KSS: + /* Instruction will be redriven, skip the PER check. */ + return kvm_s390_skey_check_enable(vcpu); + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + break; + default: + return -EOPNOTSUPP; + } + + if (should_handle_per_ifetch(vcpu, rc)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; +} diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c new file mode 100644 index 0000000000..efaebba5ee --- /dev/null +++ b/arch/s390/kvm/interrupt.c @@ -0,0 +1,3481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * handling kvm guest interrupts + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + */ + +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" +#include "trace-s390.h" +#include "pci.h" + +#define PFAULT_INIT 0x0600 +#define PFAULT_DONE 0x0680 +#define VIRTIO_PARAM 0x0d00 + +static struct kvm_s390_gib *gib; + +/* handle external calls via sigp interpretation facility */ +static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) +{ + int c, scn; + + if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) + return 0; + + BUG_ON(!kvm_s390_use_sca_entries()); + read_lock(&vcpu->kvm->arch.sca_lock); + if (vcpu->kvm->arch.use_esca) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl sigp_ctrl = + sca->cpu[vcpu->vcpu_id].sigp_ctrl; + + c = sigp_ctrl.c; + scn = sigp_ctrl.scn; + } else { + struct bsca_block *sca = vcpu->kvm->arch.sca; + union bsca_sigp_ctrl sigp_ctrl = + sca->cpu[vcpu->vcpu_id].sigp_ctrl; + + c = sigp_ctrl.c; + scn = sigp_ctrl.scn; + } + read_unlock(&vcpu->kvm->arch.sca_lock); + + if (src_id) + *src_id = scn; + + return c; +} + +static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) +{ + int expect, rc; + + BUG_ON(!kvm_s390_use_sca_entries()); + read_lock(&vcpu->kvm->arch.sca_lock); + if (vcpu->kvm->arch.use_esca) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = + &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + union esca_sigp_ctrl new_val = {0}, old_val; + + old_val = READ_ONCE(*sigp_ctrl); + new_val.scn = src_id; + new_val.c = 1; + old_val.c = 0; + + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); + } else { + struct bsca_block *sca = vcpu->kvm->arch.sca; + union bsca_sigp_ctrl *sigp_ctrl = + &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + union bsca_sigp_ctrl new_val = {0}, old_val; + + old_val = READ_ONCE(*sigp_ctrl); + new_val.scn = src_id; + new_val.c = 1; + old_val.c = 0; + + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); + } + read_unlock(&vcpu->kvm->arch.sca_lock); + + if (rc != expect) { + /* another external call is pending */ + return -EBUSY; + } + kvm_s390_set_cpuflags(vcpu, CPUSTAT_ECALL_PEND); + return 0; +} + +static void sca_clear_ext_call(struct kvm_vcpu *vcpu) +{ + int rc, expect; + + if (!kvm_s390_use_sca_entries()) + return; + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); + read_lock(&vcpu->kvm->arch.sca_lock); + if (vcpu->kvm->arch.use_esca) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = + &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + union esca_sigp_ctrl old; + + old = READ_ONCE(*sigp_ctrl); + expect = old.value; + rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + } else { + struct bsca_block *sca = vcpu->kvm->arch.sca; + union bsca_sigp_ctrl *sigp_ctrl = + &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + union bsca_sigp_ctrl old; + + old = READ_ONCE(*sigp_ctrl); + expect = old.value; + rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + } + read_unlock(&vcpu->kvm->arch.sca_lock); + WARN_ON(rc != expect); /* cannot clear? */ +} + +int psw_extint_disabled(struct kvm_vcpu *vcpu) +{ + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT); +} + +static int psw_ioint_disabled(struct kvm_vcpu *vcpu) +{ + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO); +} + +static int psw_mchk_disabled(struct kvm_vcpu *vcpu) +{ + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK); +} + +static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) +{ + return psw_extint_disabled(vcpu) && + psw_ioint_disabled(vcpu) && + psw_mchk_disabled(vcpu); +} + +static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) +{ + if (psw_extint_disabled(vcpu) || + !(vcpu->arch.sie_block->gcr[0] & CR0_CLOCK_COMPARATOR_SUBMASK)) + return 0; + if (guestdbg_enabled(vcpu) && guestdbg_sstep_enabled(vcpu)) + /* No timer interrupts when single stepping */ + return 0; + return 1; +} + +static int ckc_irq_pending(struct kvm_vcpu *vcpu) +{ + const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm); + const u64 ckc = vcpu->arch.sie_block->ckc; + + if (vcpu->arch.sie_block->gcr[0] & CR0_CLOCK_COMPARATOR_SIGN) { + if ((s64)ckc >= (s64)now) + return 0; + } else if (ckc >= now) { + return 0; + } + return ckc_interrupts_enabled(vcpu); +} + +static int cpu_timer_interrupts_enabled(struct kvm_vcpu *vcpu) +{ + return !psw_extint_disabled(vcpu) && + (vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK); +} + +static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu) +{ + if (!cpu_timer_interrupts_enabled(vcpu)) + return 0; + return kvm_s390_get_cpu_timer(vcpu) >> 63; +} + +static uint64_t isc_to_isc_bits(int isc) +{ + return (0x80 >> isc) << 24; +} + +static inline u32 isc_to_int_word(u8 isc) +{ + return ((u32)isc << 27) | 0x80000000; +} + +static inline u8 int_word_to_isc(u32 int_word) +{ + return (int_word & 0x38000000) >> 27; +} + +/* + * To use atomic bitmap functions, we have to provide a bitmap address + * that is u64 aligned. However, the ipm might be u32 aligned. + * Therefore, we logically start the bitmap at the very beginning of the + * struct and fixup the bit number. + */ +#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) + +/** + * gisa_set_iam - change the GISA interruption alert mask + * + * @gisa: gisa to operate on + * @iam: new IAM value to use + * + * Change the IAM atomically with the next alert address and the IPM + * of the GISA if the GISA is not part of the GIB alert list. All three + * fields are located in the first long word of the GISA. + * + * Returns: 0 on success + * -EBUSY in case the gisa is part of the alert list + */ +static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + if ((u64)gisa != word >> 32) + return -EBUSY; + _word = (word & ~0xffUL) | iam; + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + + return 0; +} + +/** + * gisa_clear_ipm - clear the GISA interruption pending mask + * + * @gisa: gisa to operate on + * + * Clear the IPM atomically with the next alert address and the IAM + * of the GISA unconditionally. All three fields are located in the + * first long word of the GISA. + */ +static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + _word = word & ~(0xffUL << 24); + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); +} + +/** + * gisa_get_ipm_or_restore_iam - return IPM or restore GISA IAM + * + * @gi: gisa interrupt struct to work on + * + * Atomically restores the interruption alert mask if none of the + * relevant ISCs are pending and return the IPM. + * + * Returns: the relevant pending ISCs + */ +static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) +{ + u8 pending_mask, alert_mask; + u64 word, _word; + + do { + word = READ_ONCE(gi->origin->u64.word[0]); + alert_mask = READ_ONCE(gi->alert.mask); + pending_mask = (u8)(word >> 24) & alert_mask; + if (pending_mask) + return pending_mask; + _word = (word & ~0xffUL) | alert_mask; + } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + + return 0; +} + +static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +{ + set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); +} + +static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) +{ + return READ_ONCE(gisa->ipm); +} + +static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +{ + return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); +} + +static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) +{ + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; +} + +static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + unsigned long pending_mask; + + pending_mask = pending_irqs_no_gisa(vcpu); + if (gi->origin) + pending_mask |= gisa_get_ipm(gi->origin) << IRQ_PEND_IO_ISC_7; + return pending_mask; +} + +static inline int isc_to_irq_type(unsigned long isc) +{ + return IRQ_PEND_IO_ISC_0 - isc; +} + +static inline int irq_type_to_isc(unsigned long irq_type) +{ + return IRQ_PEND_IO_ISC_0 - irq_type; +} + +static unsigned long disable_iscs(struct kvm_vcpu *vcpu, + unsigned long active_mask) +{ + int i; + + for (i = 0; i <= MAX_ISC; i++) + if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i))) + active_mask &= ~(1UL << (isc_to_irq_type(i))); + + return active_mask; +} + +static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) +{ + unsigned long active_mask; + + active_mask = pending_irqs(vcpu); + if (!active_mask) + return 0; + + if (psw_extint_disabled(vcpu)) + active_mask &= ~IRQ_PEND_EXT_MASK; + if (psw_ioint_disabled(vcpu)) + active_mask &= ~IRQ_PEND_IO_MASK; + else + active_mask = disable_iscs(vcpu, active_mask); + if (!(vcpu->arch.sie_block->gcr[0] & CR0_EXTERNAL_CALL_SUBMASK)) + __clear_bit(IRQ_PEND_EXT_EXTERNAL, &active_mask); + if (!(vcpu->arch.sie_block->gcr[0] & CR0_EMERGENCY_SIGNAL_SUBMASK)) + __clear_bit(IRQ_PEND_EXT_EMERGENCY, &active_mask); + if (!(vcpu->arch.sie_block->gcr[0] & CR0_CLOCK_COMPARATOR_SUBMASK)) + __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); + if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) + __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { + __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } + if (psw_mchk_disabled(vcpu)) + active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. */ + if (kvm_s390_pv_cpu_get_handle(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); + /* + * Check both floating and local interrupt's cr14 because + * bit IRQ_PEND_MCHK_REP could be set in both cases. + */ + if (!(vcpu->arch.sie_block->gcr[14] & + (vcpu->kvm->arch.float_int.mchk.cr14 | + vcpu->arch.local_int.irq.mchk.cr14))) + __clear_bit(IRQ_PEND_MCHK_REP, &active_mask); + + /* + * STOP irqs will never be actively delivered. They are triggered via + * intercept requests and cleared when the stop intercept is performed. + */ + __clear_bit(IRQ_PEND_SIGP_STOP, &active_mask); + + return active_mask; +} + +static void __set_cpu_idle(struct kvm_vcpu *vcpu) +{ + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); +} + +static void __unset_cpu_idle(struct kvm_vcpu *vcpu) +{ + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); +} + +static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) +{ + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IO_INT | CPUSTAT_EXT_INT | + CPUSTAT_STOP_INT); + vcpu->arch.sie_block->lctl = 0x0000; + vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT); + + if (guestdbg_enabled(vcpu)) { + vcpu->arch.sie_block->lctl |= (LCTL_CR0 | LCTL_CR9 | + LCTL_CR10 | LCTL_CR11); + vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT); + } +} + +static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) +{ + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) + return; + if (psw_ioint_disabled(vcpu)) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); + else + vcpu->arch.sie_block->lctl |= LCTL_CR6; +} + +static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) +{ + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_EXT_MASK)) + return; + if (psw_extint_disabled(vcpu)) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + else + vcpu->arch.sie_block->lctl |= LCTL_CR0; +} + +static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) +{ + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_MCHK_MASK)) + return; + if (psw_mchk_disabled(vcpu)) + vcpu->arch.sie_block->ictl |= ICTL_LPSW; + else + vcpu->arch.sie_block->lctl |= LCTL_CR14; +} + +static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu) +{ + if (kvm_s390_is_stop_irq_pending(vcpu)) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); +} + +/* Set interception request for non-deliverable interrupts */ +static void set_intercept_indicators(struct kvm_vcpu *vcpu) +{ + set_intercept_indicators_io(vcpu); + set_intercept_indicators_ext(vcpu); + set_intercept_indicators_mchk(vcpu); + set_intercept_indicators_stop(vcpu); +} + +static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc = 0; + + vcpu->stat.deliver_cputm++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, + 0, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } + clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc = 0; + + vcpu->stat.deliver_ckc++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, + 0, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } + clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_ext_info ext; + int rc; + + spin_lock(&li->lock); + ext = li->irq.ext; + clear_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); + li->irq.ext.ext_params2 = 0; + spin_unlock(&li->lock); + + VCPU_EVENT(vcpu, 4, "deliver: pfault init token 0x%llx", + ext.ext_params2); + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_PFAULT_INIT, + 0, ext.ext_params2); + + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, (u16 *) __LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, PFAULT_INIT, (u16 *) __LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, ext.ext_params2, (u64 *) __LC_EXT_PARAMS2); + return rc ? -EFAULT : 0; +} + +static int __write_machine_check(struct kvm_vcpu *vcpu, + struct kvm_s390_mchk_info *mchk) +{ + unsigned long ext_sa_addr; + unsigned long lc; + freg_t fprs[NUM_FPRS]; + union mci mci; + int rc; + + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not not have the needed information for + * protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + + mci.val = mchk->mcic; + /* take care of lazy register loading */ + save_fpu_regs(); + save_access_regs(vcpu->run->s.regs.acrs); + if (MACHINE_HAS_GS && vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + + /* Extended save area */ + rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr, + sizeof(unsigned long)); + /* Only bits 0 through 63-LC are used for address formation */ + lc = ext_sa_addr & MCESA_LC_MASK; + if (test_kvm_facility(vcpu->kvm, 133)) { + switch (lc) { + case 0: + case 10: + ext_sa_addr &= ~0x3ffUL; + break; + case 11: + ext_sa_addr &= ~0x7ffUL; + break; + case 12: + ext_sa_addr &= ~0xfffUL; + break; + default: + ext_sa_addr = 0; + break; + } + } else { + ext_sa_addr &= ~0x3ffUL; + } + + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { + if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, + 512)) + mci.vr = 0; + } else { + mci.vr = 0; + } + if (!rc && mci.gs && ext_sa_addr && test_kvm_facility(vcpu->kvm, 133) + && (lc == 11 || lc == 12)) { + if (write_guest_abs(vcpu, ext_sa_addr + 1024, + &vcpu->run->s.regs.gscb, 32)) + mci.gs = 0; + } else { + mci.gs = 0; + } + + /* General interruption information */ + rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); + + /* Register-save areas */ + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); + } else { + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.fprs, 128); + } + rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA, + vcpu->run->s.regs.gprs, 128); + rc |= put_guest_lc(vcpu, current->thread.fpu.fpc, + (u32 __user *) __LC_FP_CREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr, + (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu), + (u64 __user *) __LC_CPU_TIMER_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8, + (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA); + rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA, + &vcpu->run->s.regs.acrs, 64); + rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA, + &vcpu->arch.sie_block->gcr, 128); + + /* Extended interruption information */ + rc |= put_guest_lc(vcpu, mchk->ext_damage_code, + (u32 __user *) __LC_EXT_DAMAGE_CODE); + rc |= put_guest_lc(vcpu, mchk->failing_storage_address, + (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); + rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout, + sizeof(mchk->fixed_logout)); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_mchk_info mchk = {}; + int deliver = 0; + int rc = 0; + + spin_lock(&fi->lock); + spin_lock(&li->lock); + if (test_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs) || + test_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs)) { + /* + * If there was an exigent machine check pending, then any + * repressible machine checks that might have been pending + * are indicated along with it, so always clear bits for + * repressible and exigent interrupts + */ + mchk = li->irq.mchk; + clear_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs); + clear_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs); + memset(&li->irq.mchk, 0, sizeof(mchk)); + deliver = 1; + } + /* + * We indicate floating repressible conditions along with + * other pending conditions. Channel Report Pending and Channel + * Subsystem damage are the only two and are indicated by + * bits in mcic and masked in cr14. + */ + if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { + mchk.mcic |= fi->mchk.mcic; + mchk.cr14 |= fi->mchk.cr14; + memset(&fi->mchk, 0, sizeof(mchk)); + deliver = 1; + } + spin_unlock(&li->lock); + spin_unlock(&fi->lock); + + if (deliver) { + VCPU_EVENT(vcpu, 3, "deliver: machine check mcic 0x%llx", + mchk.mcic); + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_MCHK, + mchk.cr14, mchk.mcic); + vcpu->stat.deliver_machine_check++; + rc = __write_machine_check(vcpu, &mchk); + } + return rc; +} + +static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc = 0; + + VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); + vcpu->stat.deliver_restart_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); + + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } + clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_prefix_info prefix; + + spin_lock(&li->lock); + prefix = li->irq.prefix; + li->irq.prefix.address = 0; + clear_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); + spin_unlock(&li->lock); + + vcpu->stat.deliver_prefix_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_SIGP_SET_PREFIX, + prefix.address, 0); + + kvm_s390_set_prefix(vcpu, prefix.address); + return 0; +} + +static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc; + int cpu_addr; + + spin_lock(&li->lock); + cpu_addr = find_first_bit(li->sigp_emerg_pending, KVM_MAX_VCPUS); + clear_bit(cpu_addr, li->sigp_emerg_pending); + if (bitmap_empty(li->sigp_emerg_pending, KVM_MAX_VCPUS)) + clear_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); + spin_unlock(&li->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp emerg"); + vcpu->stat.deliver_emergency_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, + cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, cpu_addr, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_extcall_info extcall; + int rc; + + spin_lock(&li->lock); + extcall = li->irq.extcall; + li->irq.extcall.code = 0; + clear_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); + spin_unlock(&li->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp ext call"); + vcpu->stat.deliver_external_call++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_EXTERNAL_CALL, + extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, extcall.code, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + return rc ? -EFAULT : 0; +} + +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + +static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_pgm_info pgm_info; + int rc = 0, nullifying = false; + u16 ilen; + + spin_lock(&li->lock); + pgm_info = li->irq.pgm; + clear_bit(IRQ_PEND_PROG, &li->pending_irqs); + memset(&li->irq.pgm, 0, sizeof(pgm_info)); + spin_unlock(&li->lock); + + ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK; + VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d", + pgm_info.code, ilen); + vcpu->stat.deliver_program++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, + pgm_info.code, 0); + + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + + switch (pgm_info.code & ~PGM_PER) { + case PGM_AFX_TRANSLATION: + case PGM_ASX_TRANSLATION: + case PGM_EX_TRANSLATION: + case PGM_LFX_TRANSLATION: + case PGM_LSTE_SEQUENCE: + case PGM_LSX_TRANSLATION: + case PGM_LX_TRANSLATION: + case PGM_PRIMARY_AUTHORITY: + case PGM_SECONDARY_AUTHORITY: + nullifying = true; + fallthrough; + case PGM_SPACE_SWITCH: + rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, + (u64 *)__LC_TRANS_EXC_CODE); + break; + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_INSTANCE: + case PGM_ASTE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_EXTENDED_AUTHORITY: + rc = put_guest_lc(vcpu, pgm_info.exc_access_id, + (u8 *)__LC_EXC_ACCESS_ID); + nullifying = true; + break; + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, + (u64 *)__LC_TRANS_EXC_CODE); + rc |= put_guest_lc(vcpu, pgm_info.exc_access_id, + (u8 *)__LC_EXC_ACCESS_ID); + rc |= put_guest_lc(vcpu, pgm_info.op_access_id, + (u8 *)__LC_OP_ACCESS_ID); + nullifying = true; + break; + case PGM_MONITOR: + rc = put_guest_lc(vcpu, pgm_info.mon_class_nr, + (u16 *)__LC_MON_CLASS_NR); + rc |= put_guest_lc(vcpu, pgm_info.mon_code, + (u64 *)__LC_MON_CODE); + break; + case PGM_VECTOR_PROCESSING: + case PGM_DATA: + rc = put_guest_lc(vcpu, pgm_info.data_exc_code, + (u32 *)__LC_DATA_EXC_CODE); + break; + case PGM_PROTECTION: + rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, + (u64 *)__LC_TRANS_EXC_CODE); + rc |= put_guest_lc(vcpu, pgm_info.exc_access_id, + (u8 *)__LC_EXC_ACCESS_ID); + break; + case PGM_STACK_FULL: + case PGM_STACK_EMPTY: + case PGM_STACK_SPECIFICATION: + case PGM_STACK_TYPE: + case PGM_STACK_OPERATION: + case PGM_TRACE_TABEL: + case PGM_CRYPTO_OPERATION: + nullifying = true; + break; + } + + if (pgm_info.code & PGM_PER) { + rc |= put_guest_lc(vcpu, pgm_info.per_code, + (u8 *) __LC_PER_CODE); + rc |= put_guest_lc(vcpu, pgm_info.per_atmid, + (u8 *)__LC_PER_ATMID); + rc |= put_guest_lc(vcpu, pgm_info.per_address, + (u64 *) __LC_PER_ADDRESS); + rc |= put_guest_lc(vcpu, pgm_info.per_access_id, + (u8 *) __LC_PER_ACCESS_ID); + } + + if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND)) + kvm_s390_rewind_psw(vcpu, ilen); + + /* bit 1+2 of the target are the ilc, so we can directly use ilen */ + rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, + (u64 *) __LC_PGM_LAST_BREAK); + rc |= put_guest_lc(vcpu, pgm_info.code, + (u16 *)__LC_PGM_INT_CODE); + rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + return rc ? -EFAULT : 0; +} + +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_service(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + memset(&fi->srv_signal, 0, sizeof(ext)); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", + ext.ext_params); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, ext.ext_params); +} + +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); +} + +static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_interrupt_info *inti; + int rc = 0; + + spin_lock(&fi->lock); + inti = list_first_entry_or_null(&fi->lists[FIRQ_LIST_PFAULT], + struct kvm_s390_interrupt_info, + list); + if (inti) { + list_del(&inti->list); + fi->counters[FIRQ_CNTR_PFAULT] -= 1; + } + if (list_empty(&fi->lists[FIRQ_LIST_PFAULT])) + clear_bit(IRQ_PEND_PFAULT_DONE, &fi->pending_irqs); + spin_unlock(&fi->lock); + + if (inti) { + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_PFAULT_DONE, 0, + inti->ext.ext_params2); + VCPU_EVENT(vcpu, 4, "deliver: pfault done token 0x%llx", + inti->ext.ext_params2); + + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, PFAULT_DONE, + (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= put_guest_lc(vcpu, inti->ext.ext_params2, + (u64 *)__LC_EXT_PARAMS2); + kfree(inti); + } + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_interrupt_info *inti; + int rc = 0; + + spin_lock(&fi->lock); + inti = list_first_entry_or_null(&fi->lists[FIRQ_LIST_VIRTIO], + struct kvm_s390_interrupt_info, + list); + if (inti) { + VCPU_EVENT(vcpu, 4, + "deliver: virtio parm: 0x%x,parm64: 0x%llx", + inti->ext.ext_params, inti->ext.ext_params2); + vcpu->stat.deliver_virtio++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + inti->type, + inti->ext.ext_params, + inti->ext.ext_params2); + list_del(&inti->list); + fi->counters[FIRQ_CNTR_VIRTIO] -= 1; + } + if (list_empty(&fi->lists[FIRQ_LIST_VIRTIO])) + clear_bit(IRQ_PEND_VIRTIO, &fi->pending_irqs); + spin_unlock(&fi->lock); + + if (inti) { + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, VIRTIO_PARAM, + (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= put_guest_lc(vcpu, inti->ext.ext_params, + (u32 *)__LC_EXT_PARAMS); + rc |= put_guest_lc(vcpu, inti->ext.ext_params2, + (u64 *)__LC_EXT_PARAMS2); + kfree(inti); + } + return rc ? -EFAULT : 0; +} + +static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) +{ + int rc; + + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); + rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); + rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); + rc |= put_guest_lc(vcpu, io->io_int_word, (u32 *)__LC_IO_INT_WORD); + rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW, + &vcpu->arch.sie_block->gpsw, + sizeof(psw_t)); + return rc ? -EFAULT : 0; +} + +static int __must_check __deliver_io(struct kvm_vcpu *vcpu, + unsigned long irq_type) +{ + struct list_head *isc_list; + struct kvm_s390_float_interrupt *fi; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + struct kvm_s390_interrupt_info *inti = NULL; + struct kvm_s390_io_info io; + u32 isc; + int rc = 0; + + fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + isc = irq_type_to_isc(irq_type); + isc_list = &fi->lists[isc]; + inti = list_first_entry_or_null(isc_list, + struct kvm_s390_interrupt_info, + list); + if (inti) { + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)"); + else + VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); + + vcpu->stat.deliver_io++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + inti->type, + ((__u32)inti->io.subchannel_id << 16) | + inti->io.subchannel_nr, + ((__u64)inti->io.io_int_parm << 32) | + inti->io.io_int_word); + list_del(&inti->list); + fi->counters[FIRQ_CNTR_IO] -= 1; + } + if (list_empty(isc_list)) + clear_bit(irq_type, &fi->pending_irqs); + spin_unlock(&fi->lock); + + if (inti) { + rc = __do_deliver_io(vcpu, &(inti->io)); + kfree(inti); + goto out; + } + + if (gi->origin && gisa_tac_ipm_gisc(gi->origin, isc)) { + /* + * in case an adapter interrupt was not delivered + * in SIE context KVM will handle the delivery + */ + VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc); + memset(&io, 0, sizeof(io)); + io.io_int_word = isc_to_int_word(isc); + vcpu->stat.deliver_io++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_IO(1, 0, 0, 0), + ((__u32)io.subchannel_id << 16) | + io.subchannel_nr, + ((__u64)io.io_int_parm << 32) | + io.io_int_word); + rc = __do_deliver_io(vcpu, &io); + } +out: + return rc; +} + +/* Check whether an external call is pending (deliverable or not) */ +int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + if (!sclp.has_sigpif) + return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); + + return sca_ext_call_pending(vcpu, NULL); +} + +int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) +{ + if (deliverable_irqs(vcpu)) + return 1; + + if (kvm_cpu_has_pending_timer(vcpu)) + return 1; + + /* external call pending and deliverable */ + if (kvm_s390_ext_call_pending(vcpu) && + !psw_extint_disabled(vcpu) && + (vcpu->arch.sie_block->gcr[0] & CR0_EXTERNAL_CALL_SUBMASK)) + return 1; + + if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) + return 1; + return 0; +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return ckc_irq_pending(vcpu) || cpu_timer_irq_pending(vcpu); +} + +static u64 __calculate_sltime(struct kvm_vcpu *vcpu) +{ + const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm); + const u64 ckc = vcpu->arch.sie_block->ckc; + u64 cputm, sltime = 0; + + if (ckc_interrupts_enabled(vcpu)) { + if (vcpu->arch.sie_block->gcr[0] & CR0_CLOCK_COMPARATOR_SIGN) { + if ((s64)now < (s64)ckc) + sltime = tod_to_ns((s64)ckc - (s64)now); + } else if (now < ckc) { + sltime = tod_to_ns(ckc - now); + } + /* already expired */ + if (!sltime) + return 0; + if (cpu_timer_interrupts_enabled(vcpu)) { + cputm = kvm_s390_get_cpu_timer(vcpu); + /* already expired? */ + if (cputm >> 63) + return 0; + return min_t(u64, sltime, tod_to_ns(cputm)); + } + } else if (cpu_timer_interrupts_enabled(vcpu)) { + sltime = kvm_s390_get_cpu_timer(vcpu); + /* already expired? */ + if (sltime >> 63) + return 0; + } + return sltime; +} + +int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + u64 sltime; + + vcpu->stat.exit_wait_state++; + + /* fast path */ + if (kvm_arch_vcpu_runnable(vcpu)) + return 0; + + if (psw_interrupts_disabled(vcpu)) { + VCPU_EVENT(vcpu, 3, "%s", "disabled wait"); + return -EOPNOTSUPP; /* disabled wait */ + } + + if (gi->origin && + (gisa_get_ipm_or_restore_iam(gi) & + vcpu->arch.sie_block->gcr[6] >> 24)) + return 0; + + if (!ckc_interrupts_enabled(vcpu) && + !cpu_timer_interrupts_enabled(vcpu)) { + VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); + __set_cpu_idle(vcpu); + goto no_timer; + } + + sltime = __calculate_sltime(vcpu); + if (!sltime) + return 0; + + __set_cpu_idle(vcpu); + hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL); + VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); +no_timer: + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_vcpu_halt(vcpu); + vcpu->valid_wakeup = false; + __unset_cpu_idle(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + + hrtimer_cancel(&vcpu->arch.ckc_timer); + return 0; +} + +void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) +{ + vcpu->valid_wakeup = true; + kvm_vcpu_wake_up(vcpu); + + /* + * The VCPU might not be sleeping but rather executing VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); +} + +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + u64 sltime; + + vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + sltime = __calculate_sltime(vcpu); + + /* + * If the monotonic clock runs faster than the tod clock we might be + * woken up too early and have to go back to sleep to avoid deadlocks. + */ + if (sltime && hrtimer_forward_now(timer, ns_to_ktime(sltime))) + return HRTIMER_RESTART; + kvm_s390_vcpu_wakeup(vcpu); + return HRTIMER_NORESTART; +} + +void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + spin_lock(&li->lock); + li->pending_irqs = 0; + bitmap_zero(li->sigp_emerg_pending, KVM_MAX_VCPUS); + memset(&li->irq, 0, sizeof(li->irq)); + spin_unlock(&li->lock); + + sca_clear_ext_call(vcpu); +} + +int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc = 0; + bool delivered = false; + unsigned long irq_type; + unsigned long irqs; + + __reset_intercept_indicators(vcpu); + + /* pending ckc conditions might have been invalidated */ + clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); + if (ckc_irq_pending(vcpu)) + set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); + + /* pending cpu timer conditions might have been invalidated */ + clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); + if (cpu_timer_irq_pending(vcpu)) + set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); + + while ((irqs = deliverable_irqs(vcpu)) && !rc) { + /* bits are in the reverse order of interrupt priority */ + irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT); + switch (irq_type) { + case IRQ_PEND_IO_ISC_0: + case IRQ_PEND_IO_ISC_1: + case IRQ_PEND_IO_ISC_2: + case IRQ_PEND_IO_ISC_3: + case IRQ_PEND_IO_ISC_4: + case IRQ_PEND_IO_ISC_5: + case IRQ_PEND_IO_ISC_6: + case IRQ_PEND_IO_ISC_7: + rc = __deliver_io(vcpu, irq_type); + break; + case IRQ_PEND_MCHK_EX: + case IRQ_PEND_MCHK_REP: + rc = __deliver_machine_check(vcpu); + break; + case IRQ_PEND_PROG: + rc = __deliver_prog(vcpu); + break; + case IRQ_PEND_EXT_EMERGENCY: + rc = __deliver_emergency_signal(vcpu); + break; + case IRQ_PEND_EXT_EXTERNAL: + rc = __deliver_external_call(vcpu); + break; + case IRQ_PEND_EXT_CLOCK_COMP: + rc = __deliver_ckc(vcpu); + break; + case IRQ_PEND_EXT_CPU_TIMER: + rc = __deliver_cpu_timer(vcpu); + break; + case IRQ_PEND_RESTART: + rc = __deliver_restart(vcpu); + break; + case IRQ_PEND_SET_PREFIX: + rc = __deliver_set_prefix(vcpu); + break; + case IRQ_PEND_PFAULT_INIT: + rc = __deliver_pfault_init(vcpu); + break; + case IRQ_PEND_EXT_SERVICE: + rc = __deliver_service(vcpu); + break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; + case IRQ_PEND_PFAULT_DONE: + rc = __deliver_pfault_done(vcpu); + break; + case IRQ_PEND_VIRTIO: + rc = __deliver_virtio(vcpu); + break; + default: + WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); + clear_bit(irq_type, &li->pending_irqs); + } + delivered |= !rc; + } + + /* + * We delivered at least one interrupt and modified the PC. Force a + * singlestep event now. + */ + if (delivered && guestdbg_sstep_enabled(vcpu)) { + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + + debug_exit->addr = vcpu->arch.sie_block->gpsw.addr; + debug_exit->type = KVM_SINGLESTEP; + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + } + + set_intercept_indicators(vcpu); + + return rc; +} + +static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_program++; + VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, + irq->u.pgm.code, 0); + + if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) { + /* auto detection if no valid ILC was given */ + irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK; + irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu); + irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID; + } + + if (irq->u.pgm.code == PGM_PER) { + li->irq.pgm.code |= PGM_PER; + li->irq.pgm.flags = irq->u.pgm.flags; + /* only modify PER related information */ + li->irq.pgm.per_address = irq->u.pgm.per_address; + li->irq.pgm.per_code = irq->u.pgm.per_code; + li->irq.pgm.per_atmid = irq->u.pgm.per_atmid; + li->irq.pgm.per_access_id = irq->u.pgm.per_access_id; + } else if (!(irq->u.pgm.code & PGM_PER)) { + li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) | + irq->u.pgm.code; + li->irq.pgm.flags = irq->u.pgm.flags; + /* only modify non-PER information */ + li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code; + li->irq.pgm.mon_code = irq->u.pgm.mon_code; + li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code; + li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr; + li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id; + li->irq.pgm.op_access_id = irq->u.pgm.op_access_id; + } else { + li->irq.pgm = irq->u.pgm; + } + set_bit(IRQ_PEND_PROG, &li->pending_irqs); + return 0; +} + +static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_pfault_init++; + VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx", + irq->u.ext.ext_params2); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, + irq->u.ext.ext_params, + irq->u.ext.ext_params2); + + li->irq.ext = irq->u.ext; + set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + return 0; +} + +static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_extcall_info *extcall = &li->irq.extcall; + uint16_t src_id = irq->u.extcall.code; + + vcpu->stat.inject_external_call++; + VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u", + src_id); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, + src_id, 0); + + /* sending vcpu invalid */ + if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) + return -EINVAL; + + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) + return sca_inject_ext_call(vcpu, src_id); + + if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) + return -EBUSY; + *extcall = irq->u.extcall; + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + return 0; +} + +static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_prefix_info *prefix = &li->irq.prefix; + + vcpu->stat.inject_set_prefix++; + VCPU_EVENT(vcpu, 3, "inject: set prefix to %x", + irq->u.prefix.address); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, + irq->u.prefix.address, 0); + + if (!is_vcpu_stopped(vcpu)) + return -EBUSY; + + *prefix = irq->u.prefix; + set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); + return 0; +} + +#define KVM_S390_STOP_SUPP_FLAGS (KVM_S390_STOP_FLAG_STORE_STATUS) +static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_stop_info *stop = &li->irq.stop; + int rc = 0; + + vcpu->stat.inject_stop_signal++; + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0); + + if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS) + return -EINVAL; + + if (is_vcpu_stopped(vcpu)) { + if (irq->u.stop.flags & KVM_S390_STOP_FLAG_STORE_STATUS) + rc = kvm_s390_store_status_unloaded(vcpu, + KVM_S390_STORE_STATUS_NOADDR); + return rc; + } + + if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs)) + return -EBUSY; + stop->flags = irq->u.stop.flags; + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); + return 0; +} + +static int __inject_sigp_restart(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_restart++; + VCPU_EVENT(vcpu, 3, "%s", "inject: restart int"); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); + + set_bit(IRQ_PEND_RESTART, &li->pending_irqs); + return 0; +} + +static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, + struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_emergency_signal++; + VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u", + irq->u.emerg.code); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, + irq->u.emerg.code, 0); + + /* sending vcpu invalid */ + if (kvm_get_vcpu_by_id(vcpu->kvm, irq->u.emerg.code) == NULL) + return -EINVAL; + + set_bit(irq->u.emerg.code, li->sigp_emerg_pending); + set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + return 0; +} + +static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_mchk_info *mchk = &li->irq.mchk; + + vcpu->stat.inject_mchk++; + VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx", + irq->u.mchk.mcic); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, + irq->u.mchk.mcic); + + /* + * Because repressible machine checks can be indicated along with + * exigent machine checks (PoP, Chapter 11, Interruption action) + * we need to combine cr14, mcic and external damage code. + * Failing storage address and the logout area should not be or'ed + * together, we just indicate the last occurrence of the corresponding + * machine check + */ + mchk->cr14 |= irq->u.mchk.cr14; + mchk->mcic |= irq->u.mchk.mcic; + mchk->ext_damage_code |= irq->u.mchk.ext_damage_code; + mchk->failing_storage_address = irq->u.mchk.failing_storage_address; + memcpy(&mchk->fixed_logout, &irq->u.mchk.fixed_logout, + sizeof(mchk->fixed_logout)); + if (mchk->mcic & MCHK_EX_MASK) + set_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs); + else if (mchk->mcic & MCHK_REP_MASK) + set_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs); + return 0; +} + +static int __inject_ckc(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_ckc++; + VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external"); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, + 0, 0); + + set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + return 0; +} + +static int __inject_cpu_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + vcpu->stat.inject_cputm++; + VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external"); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, + 0, 0); + + set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); + return 0; +} + +static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm, + int isc, u32 schid) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct list_head *isc_list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; + struct kvm_s390_interrupt_info *iter; + u16 id = (schid & 0xffff0000U) >> 16; + u16 nr = schid & 0x0000ffffU; + + spin_lock(&fi->lock); + list_for_each_entry(iter, isc_list, list) { + if (schid && (id != iter->io.subchannel_id || + nr != iter->io.subchannel_nr)) + continue; + /* found an appropriate entry */ + list_del_init(&iter->list); + fi->counters[FIRQ_CNTR_IO] -= 1; + if (list_empty(isc_list)) + clear_bit(isc_to_irq_type(isc), &fi->pending_irqs); + spin_unlock(&fi->lock); + return iter; + } + spin_unlock(&fi->lock); + return NULL; +} + +static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm, + u64 isc_mask, u32 schid) +{ + struct kvm_s390_interrupt_info *inti = NULL; + int isc; + + for (isc = 0; isc <= MAX_ISC && !inti; isc++) { + if (isc_mask & isc_to_isc_bits(isc)) + inti = get_io_int(kvm, isc, schid); + } + return inti; +} + +static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + unsigned long active_mask; + int isc; + + if (schid) + goto out; + if (!gi->origin) + goto out; + + active_mask = (isc_mask & gisa_get_ipm(gi->origin) << 24) << 32; + while (active_mask) { + isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); + if (gisa_tac_ipm_gisc(gi->origin, isc)) + return isc; + clear_bit_inv(isc, &active_mask); + } +out: + return -EINVAL; +} + +/* + * Dequeue and return an I/O interrupt matching any of the interruption + * subclasses as designated by the isc mask in cr6 and the schid (if != 0). + * Take into account the interrupts pending in the interrupt list and in GISA. + * + * Note that for a guest that does not enable I/O interrupts + * but relies on TPI, a flood of classic interrupts may starve + * out adapter interrupts on the same isc. Linux does not do + * that, and it is possible to work around the issue by configuring + * different iscs for classic and adapter interrupts in the guest, + * but we may want to revisit this in the future. + */ +struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, + u64 isc_mask, u32 schid) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_interrupt_info *inti, *tmp_inti; + int isc; + + inti = get_top_io_int(kvm, isc_mask, schid); + + isc = get_top_gisa_isc(kvm, isc_mask, schid); + if (isc < 0) + /* no AI in GISA */ + goto out; + + if (!inti) + /* AI in GISA but no classical IO int */ + goto gisa_out; + + /* both types of interrupts present */ + if (int_word_to_isc(inti->io.io_int_word) <= isc) { + /* classical IO int with higher priority */ + gisa_set_ipm_gisc(gi->origin, isc); + goto out; + } +gisa_out: + tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + if (tmp_inti) { + tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); + tmp_inti->io.io_int_word = isc_to_int_word(isc); + if (inti) + kvm_s390_reinject_io_int(kvm, inti); + inti = tmp_inti; + } else + gisa_set_ipm_gisc(gi->origin, isc); +out: + return inti; +} + +static int __inject_service(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + + kvm->stat.inject_service_signal++; + spin_lock(&fi->lock); + fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + + /* + * Early versions of the QEMU s390 bios will inject several + * service interrupts after another without handling a + * condition code indicating busy. + * We will silently ignore those superfluous sccb values. + * A future version of QEMU will take care of serialization + * of servc requests + */ + if (fi->srv_signal.ext_params & SCCB_MASK) + goto out; + fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_MASK; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); +out: + spin_unlock(&fi->lock); + kfree(inti); + return 0; +} + +static int __inject_virtio(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + + kvm->stat.inject_virtio++; + spin_lock(&fi->lock); + if (fi->counters[FIRQ_CNTR_VIRTIO] >= KVM_S390_MAX_VIRTIO_IRQS) { + spin_unlock(&fi->lock); + return -EBUSY; + } + fi->counters[FIRQ_CNTR_VIRTIO] += 1; + list_add_tail(&inti->list, &fi->lists[FIRQ_LIST_VIRTIO]); + set_bit(IRQ_PEND_VIRTIO, &fi->pending_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int __inject_pfault_done(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + + kvm->stat.inject_pfault_done++; + spin_lock(&fi->lock); + if (fi->counters[FIRQ_CNTR_PFAULT] >= + (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)) { + spin_unlock(&fi->lock); + return -EBUSY; + } + fi->counters[FIRQ_CNTR_PFAULT] += 1; + list_add_tail(&inti->list, &fi->lists[FIRQ_LIST_PFAULT]); + set_bit(IRQ_PEND_PFAULT_DONE, &fi->pending_irqs); + spin_unlock(&fi->lock); + return 0; +} + +#define CR_PENDING_SUBCLASS 28 +static int __inject_float_mchk(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + + kvm->stat.inject_float_mchk++; + spin_lock(&fi->lock); + fi->mchk.cr14 |= inti->mchk.cr14 & (1UL << CR_PENDING_SUBCLASS); + fi->mchk.mcic |= inti->mchk.mcic; + set_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs); + spin_unlock(&fi->lock); + kfree(inti); + return 0; +} + +static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_float_interrupt *fi; + struct list_head *list; + int isc; + + kvm->stat.inject_io++; + isc = int_word_to_isc(inti->io.io_int_word); + + /* + * We do not use the lock checking variant as this is just a + * performance optimization and we do not hold the lock here. + * This is ok as the code will pick interrupts from both "lists" + * for delivery. + */ + if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { + VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); + gisa_set_ipm_gisc(gi->origin, isc); + kfree(inti); + return 0; + } + + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + if (fi->counters[FIRQ_CNTR_IO] >= KVM_S390_MAX_FLOAT_IRQS) { + spin_unlock(&fi->lock); + return -EBUSY; + } + fi->counters[FIRQ_CNTR_IO] += 1; + + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)"); + else + VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); + list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; + list_add_tail(&inti->list, list); + set_bit(isc_to_irq_type(isc), &fi->pending_irqs); + spin_unlock(&fi->lock); + return 0; +} + +/* + * Find a destination VCPU for a floating irq and kick it. + */ +static void __floating_irq_kick(struct kvm *kvm, u64 type) +{ + struct kvm_vcpu *dst_vcpu; + int sigcpu, online_vcpus, nr_tries = 0; + + online_vcpus = atomic_read(&kvm->online_vcpus); + if (!online_vcpus) + return; + + /* find idle VCPUs first, then round robin */ + sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); + if (sigcpu == online_vcpus) { + do { + sigcpu = kvm->arch.float_int.next_rr_cpu++; + kvm->arch.float_int.next_rr_cpu %= online_vcpus; + /* avoid endless loops if all vcpus are stopped */ + if (nr_tries++ >= online_vcpus) + return; + } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu))); + } + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + + /* make the VCPU drop out of the SIE, or wake it up if sleeping */ + switch (type) { + case KVM_S390_MCHK: + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + if (!(type & KVM_S390_INT_IO_AI_MASK && + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); + break; + default: + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT); + break; + } + kvm_s390_vcpu_wakeup(dst_vcpu); +} + +static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) +{ + u64 type = READ_ONCE(inti->type); + int rc; + + switch (type) { + case KVM_S390_MCHK: + rc = __inject_float_mchk(kvm, inti); + break; + case KVM_S390_INT_VIRTIO: + rc = __inject_virtio(kvm, inti); + break; + case KVM_S390_INT_SERVICE: + rc = __inject_service(kvm, inti); + break; + case KVM_S390_INT_PFAULT_DONE: + rc = __inject_pfault_done(kvm, inti); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + rc = __inject_io(kvm, inti); + break; + default: + rc = -EINVAL; + } + if (rc) + return rc; + + __floating_irq_kick(kvm, type); + return 0; +} + +int kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int) +{ + struct kvm_s390_interrupt_info *inti; + int rc; + + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + if (!inti) + return -ENOMEM; + + inti->type = s390int->type; + switch (inti->type) { + case KVM_S390_INT_VIRTIO: + VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx", + s390int->parm, s390int->parm64); + inti->ext.ext_params = s390int->parm; + inti->ext.ext_params2 = s390int->parm64; + break; + case KVM_S390_INT_SERVICE: + VM_EVENT(kvm, 4, "inject: sclp parm:%x", s390int->parm); + inti->ext.ext_params = s390int->parm; + break; + case KVM_S390_INT_PFAULT_DONE: + inti->ext.ext_params2 = s390int->parm64; + break; + case KVM_S390_MCHK: + VM_EVENT(kvm, 3, "inject: machine check mcic 0x%llx", + s390int->parm64); + inti->mchk.cr14 = s390int->parm; /* upper bits are not used */ + inti->mchk.mcic = s390int->parm64; + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + inti->io.subchannel_id = s390int->parm >> 16; + inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; + inti->io.io_int_parm = s390int->parm64 >> 32; + inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull; + break; + default: + kfree(inti); + return -EINVAL; + } + trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, + 2); + + rc = __inject_vm(kvm, inti); + if (rc) + kfree(inti); + return rc; +} + +int kvm_s390_reinject_io_int(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti) +{ + return __inject_vm(kvm, inti); +} + +int s390int_to_s390irq(struct kvm_s390_interrupt *s390int, + struct kvm_s390_irq *irq) +{ + irq->type = s390int->type; + switch (irq->type) { + case KVM_S390_PROGRAM_INT: + if (s390int->parm & 0xffff0000) + return -EINVAL; + irq->u.pgm.code = s390int->parm; + break; + case KVM_S390_SIGP_SET_PREFIX: + irq->u.prefix.address = s390int->parm; + break; + case KVM_S390_SIGP_STOP: + irq->u.stop.flags = s390int->parm; + break; + case KVM_S390_INT_EXTERNAL_CALL: + if (s390int->parm & 0xffff0000) + return -EINVAL; + irq->u.extcall.code = s390int->parm; + break; + case KVM_S390_INT_EMERGENCY: + if (s390int->parm & 0xffff0000) + return -EINVAL; + irq->u.emerg.code = s390int->parm; + break; + case KVM_S390_MCHK: + irq->u.mchk.mcic = s390int->parm64; + break; + case KVM_S390_INT_PFAULT_INIT: + irq->u.ext.ext_params = s390int->parm; + irq->u.ext.ext_params2 = s390int->parm64; + break; + case KVM_S390_RESTART: + case KVM_S390_INT_CLOCK_COMP: + case KVM_S390_INT_CPU_TIMER: + break; + default: + return -EINVAL; + } + return 0; +} + +int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); +} + +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_RESTART, &li->pending_irqs); +} + +void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + spin_lock(&li->lock); + li->irq.stop.flags = 0; + clear_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); + spin_unlock(&li->lock); +} + +static int do_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + int rc; + + switch (irq->type) { + case KVM_S390_PROGRAM_INT: + rc = __inject_prog(vcpu, irq); + break; + case KVM_S390_SIGP_SET_PREFIX: + rc = __inject_set_prefix(vcpu, irq); + break; + case KVM_S390_SIGP_STOP: + rc = __inject_sigp_stop(vcpu, irq); + break; + case KVM_S390_RESTART: + rc = __inject_sigp_restart(vcpu); + break; + case KVM_S390_INT_CLOCK_COMP: + rc = __inject_ckc(vcpu); + break; + case KVM_S390_INT_CPU_TIMER: + rc = __inject_cpu_timer(vcpu); + break; + case KVM_S390_INT_EXTERNAL_CALL: + rc = __inject_extcall(vcpu, irq); + break; + case KVM_S390_INT_EMERGENCY: + rc = __inject_sigp_emergency(vcpu, irq); + break; + case KVM_S390_MCHK: + rc = __inject_mchk(vcpu, irq); + break; + case KVM_S390_INT_PFAULT_INIT: + rc = __inject_pfault_init(vcpu, irq); + break; + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + default: + rc = -EINVAL; + } + + return rc; +} + +int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + int rc; + + spin_lock(&li->lock); + rc = do_inject_vcpu(vcpu, irq); + spin_unlock(&li->lock); + if (!rc) + kvm_s390_vcpu_wakeup(vcpu); + return rc; +} + +static inline void clear_irq_list(struct list_head *_list) +{ + struct kvm_s390_interrupt_info *inti, *n; + + list_for_each_entry_safe(inti, n, _list, list) { + list_del(&inti->list); + kfree(inti); + } +} + +static void inti_to_irq(struct kvm_s390_interrupt_info *inti, + struct kvm_s390_irq *irq) +{ + irq->type = inti->type; + switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: + case KVM_S390_INT_VIRTIO: + irq->u.ext = inti->ext; + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + irq->u.io = inti->io; + break; + } +} + +void kvm_s390_clear_float_irqs(struct kvm *kvm) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + int i; + + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); + spin_lock(&fi->lock); + fi->pending_irqs = 0; + memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); + memset(&fi->mchk, 0, sizeof(fi->mchk)); + for (i = 0; i < FIRQ_LIST_COUNT; i++) + clear_irq_list(&fi->lists[i]); + for (i = 0; i < FIRQ_MAX_COUNT; i++) + fi->counters[i] = 0; + spin_unlock(&fi->lock); + kvm_s390_gisa_clear(kvm); +}; + +static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_interrupt_info *inti; + struct kvm_s390_float_interrupt *fi; + struct kvm_s390_irq *buf; + struct kvm_s390_irq *irq; + int max_irqs; + int ret = 0; + int n = 0; + int i; + + if (len > KVM_S390_FLIC_MAX_BUFFER || len == 0) + return -EINVAL; + + /* + * We are already using -ENOMEM to signal + * userspace it may retry with a bigger buffer, + * so we need to use something else for this case + */ + buf = vzalloc(len); + if (!buf) + return -ENOBUFS; + + max_irqs = len / sizeof(struct kvm_s390_irq); + + if (gi->origin && gisa_get_ipm(gi->origin)) { + for (i = 0; i <= MAX_ISC; i++) { + if (n == max_irqs) { + /* signal userspace to try again */ + ret = -ENOMEM; + goto out_nolock; + } + if (gisa_tac_ipm_gisc(gi->origin, i)) { + irq = (struct kvm_s390_irq *) &buf[n]; + irq->type = KVM_S390_INT_IO(1, 0, 0, 0); + irq->u.io.io_int_word = isc_to_int_word(i); + n++; + } + } + } + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + for (i = 0; i < FIRQ_LIST_COUNT; i++) { + list_for_each_entry(inti, &fi->lists[i], list) { + if (n == max_irqs) { + /* signal userspace to try again */ + ret = -ENOMEM; + goto out; + } + inti_to_irq(inti, &buf[n]); + n++; + } + } + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { + if (n == max_irqs) { + /* signal userspace to try again */ + ret = -ENOMEM; + goto out; + } + irq = (struct kvm_s390_irq *) &buf[n]; + irq->type = KVM_S390_INT_SERVICE; + irq->u.ext = fi->srv_signal; + n++; + } + if (test_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { + if (n == max_irqs) { + /* signal userspace to try again */ + ret = -ENOMEM; + goto out; + } + irq = (struct kvm_s390_irq *) &buf[n]; + irq->type = KVM_S390_MCHK; + irq->u.mchk = fi->mchk; + n++; +} + +out: + spin_unlock(&fi->lock); +out_nolock: + if (!ret && n > 0) { + if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n)) + ret = -EFAULT; + } + vfree(buf); + + return ret < 0 ? ret : n; +} + +static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (attr->attr < sizeof(ais)) + return -EINVAL; + + if (!test_kvm_facility(kvm, 72)) + return -EOPNOTSUPP; + + mutex_lock(&fi->ais_lock); + ais.simm = fi->simm; + ais.nimm = fi->nimm; + mutex_unlock(&fi->ais_lock); + + if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais))) + return -EFAULT; + + return 0; +} + +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_FLIC_GET_ALL_IRQS: + r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr, + attr->attr); + break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_get_all(dev->kvm, attr); + break; + default: + r = -EINVAL; + } + + return r; +} + +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti, + u64 addr) +{ + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr; + void *target = NULL; + void __user *source; + u64 size; + + if (get_user(inti->type, (u64 __user *)addr)) + return -EFAULT; + + switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + target = (void *) &inti->ext; + source = &uptr->u.ext; + size = sizeof(inti->ext); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + target = (void *) &inti->io; + source = &uptr->u.io; + size = sizeof(inti->io); + break; + case KVM_S390_MCHK: + target = (void *) &inti->mchk; + source = &uptr->u.mchk; + size = sizeof(inti->mchk); + break; + default: + return -EINVAL; + } + + if (copy_from_user(target, source, size)) + return -EFAULT; + + return 0; +} + +static int enqueue_floating_irq(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_s390_interrupt_info *inti = NULL; + int r = 0; + int len = attr->attr; + + if (len % sizeof(struct kvm_s390_irq) != 0) + return -EINVAL; + else if (len > KVM_S390_FLIC_MAX_BUFFER) + return -EINVAL; + + while (len >= sizeof(struct kvm_s390_irq)) { + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + if (!inti) + return -ENOMEM; + + r = copy_irq_from_user(inti, attr->addr); + if (r) { + kfree(inti); + return r; + } + r = __inject_vm(dev->kvm, inti); + if (r) { + kfree(inti); + return r; + } + len -= sizeof(struct kvm_s390_irq); + attr->addr += sizeof(struct kvm_s390_irq); + } + + return r; +} + +static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id) +{ + if (id >= MAX_S390_IO_ADAPTERS) + return NULL; + id = array_index_nospec(id, MAX_S390_IO_ADAPTERS); + return kvm->arch.adapters[id]; +} + +static int register_io_adapter(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct s390_io_adapter *adapter; + struct kvm_s390_io_adapter adapter_info; + + if (copy_from_user(&adapter_info, + (void __user *)attr->addr, sizeof(adapter_info))) + return -EFAULT; + + if (adapter_info.id >= MAX_S390_IO_ADAPTERS) + return -EINVAL; + + adapter_info.id = array_index_nospec(adapter_info.id, + MAX_S390_IO_ADAPTERS); + + if (dev->kvm->arch.adapters[adapter_info.id] != NULL) + return -EINVAL; + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); + if (!adapter) + return -ENOMEM; + + adapter->id = adapter_info.id; + adapter->isc = adapter_info.isc; + adapter->maskable = adapter_info.maskable; + adapter->masked = false; + adapter->swap = adapter_info.swap; + adapter->suppressible = (adapter_info.flags) & + KVM_S390_ADAPTER_SUPPRESSIBLE; + dev->kvm->arch.adapters[adapter->id] = adapter; + + return 0; +} + +int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) +{ + int ret; + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + + if (!adapter || !adapter->maskable) + return -EINVAL; + ret = adapter->masked; + adapter->masked = masked; + return ret; +} + +void kvm_s390_destroy_adapters(struct kvm *kvm) +{ + int i; + + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) + kfree(kvm->arch.adapters[i]); +} + +static int modify_io_adapter(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_s390_io_adapter_req req; + struct s390_io_adapter *adapter; + int ret; + + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) + return -EFAULT; + + adapter = get_io_adapter(dev->kvm, req.id); + if (!adapter) + return -EINVAL; + switch (req.type) { + case KVM_S390_IO_ADAPTER_MASK: + ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask); + if (ret > 0) + ret = 0; + break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ + case KVM_S390_IO_ADAPTER_MAP: + case KVM_S390_IO_ADAPTER_UNMAP: + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) + +{ + const u64 isc_mask = 0xffUL << 24; /* all iscs set */ + u32 schid; + + if (attr->flags) + return -EINVAL; + if (attr->attr != sizeof(schid)) + return -EINVAL; + if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid))) + return -EFAULT; + if (!schid) + return -EINVAL; + kfree(kvm_s390_get_io_int(kvm, isc_mask, schid)); + /* + * If userspace is conforming to the architecture, we can have at most + * one pending I/O interrupt per subchannel, so this is effectively a + * clear all. + */ + return 0; +} + +static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_req req; + int ret = 0; + + if (!test_kvm_facility(kvm, 72)) + return -EOPNOTSUPP; + + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) + return -EFAULT; + + if (req.isc > MAX_ISC) + return -EINVAL; + + trace_kvm_s390_modify_ais_mode(req.isc, + (fi->simm & AIS_MODE_MASK(req.isc)) ? + (fi->nimm & AIS_MODE_MASK(req.isc)) ? + 2 : KVM_S390_AIS_MODE_SINGLE : + KVM_S390_AIS_MODE_ALL, req.mode); + + mutex_lock(&fi->ais_lock); + switch (req.mode) { + case KVM_S390_AIS_MODE_ALL: + fi->simm &= ~AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + case KVM_S390_AIS_MODE_SINGLE: + fi->simm |= AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&fi->ais_lock); + + return ret; +} + +static int kvm_s390_inject_airq(struct kvm *kvm, + struct s390_io_adapter *adapter) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_interrupt s390int = { + .type = KVM_S390_INT_IO(1, 0, 0, 0), + .parm = 0, + .parm64 = isc_to_int_word(adapter->isc), + }; + int ret = 0; + + if (!test_kvm_facility(kvm, 72) || !adapter->suppressible) + return kvm_s390_inject_vm(kvm, &s390int); + + mutex_lock(&fi->ais_lock); + if (fi->nimm & AIS_MODE_MASK(adapter->isc)) { + trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc); + goto out; + } + + ret = kvm_s390_inject_vm(kvm, &s390int); + if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) { + fi->nimm |= AIS_MODE_MASK(adapter->isc); + trace_kvm_s390_modify_ais_mode(adapter->isc, + KVM_S390_AIS_MODE_SINGLE, 2); + } +out: + mutex_unlock(&fi->ais_lock); + return ret; +} + +static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) +{ + unsigned int id = attr->attr; + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + + if (!adapter) + return -EINVAL; + + return kvm_s390_inject_airq(kvm, adapter); +} + +static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (!test_kvm_facility(kvm, 72)) + return -EOPNOTSUPP; + + if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) + return -EFAULT; + + mutex_lock(&fi->ais_lock); + fi->simm = ais.simm; + fi->nimm = ais.nimm; + mutex_unlock(&fi->ais_lock); + + return 0; +} + +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + + switch (attr->group) { + case KVM_DEV_FLIC_ENQUEUE: + r = enqueue_floating_irq(dev, attr); + break; + case KVM_DEV_FLIC_CLEAR_IRQS: + kvm_s390_clear_float_irqs(dev->kvm); + break; + case KVM_DEV_FLIC_APF_ENABLE: + dev->kvm->arch.gmap->pfault_enabled = 1; + break; + case KVM_DEV_FLIC_APF_DISABLE_WAIT: + dev->kvm->arch.gmap->pfault_enabled = 0; + /* + * Make sure no async faults are in transition when + * clearing the queues. So we don't need to worry + * about late coming workers. + */ + synchronize_srcu(&dev->kvm->srcu); + kvm_for_each_vcpu(i, vcpu, dev->kvm) + kvm_clear_async_pf_completion_queue(vcpu); + break; + case KVM_DEV_FLIC_ADAPTER_REGISTER: + r = register_io_adapter(dev, attr); + break; + case KVM_DEV_FLIC_ADAPTER_MODIFY: + r = modify_io_adapter(dev, attr); + break; + case KVM_DEV_FLIC_CLEAR_IO_IRQ: + r = clear_io_irq(dev->kvm, attr); + break; + case KVM_DEV_FLIC_AISM: + r = modify_ais_mode(dev->kvm, attr); + break; + case KVM_DEV_FLIC_AIRQ_INJECT: + r = flic_inject_airq(dev->kvm, attr); + break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_set_all(dev->kvm, attr); + break; + default: + r = -EINVAL; + } + + return r; +} + +static int flic_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_FLIC_GET_ALL_IRQS: + case KVM_DEV_FLIC_ENQUEUE: + case KVM_DEV_FLIC_CLEAR_IRQS: + case KVM_DEV_FLIC_APF_ENABLE: + case KVM_DEV_FLIC_APF_DISABLE_WAIT: + case KVM_DEV_FLIC_ADAPTER_REGISTER: + case KVM_DEV_FLIC_ADAPTER_MODIFY: + case KVM_DEV_FLIC_CLEAR_IO_IRQ: + case KVM_DEV_FLIC_AISM: + case KVM_DEV_FLIC_AIRQ_INJECT: + case KVM_DEV_FLIC_AISM_ALL: + return 0; + } + return -ENXIO; +} + +static int flic_create(struct kvm_device *dev, u32 type) +{ + if (!dev) + return -EINVAL; + if (dev->kvm->arch.flic) + return -EINVAL; + dev->kvm->arch.flic = dev; + return 0; +} + +static void flic_destroy(struct kvm_device *dev) +{ + dev->kvm->arch.flic = NULL; + kfree(dev); +} + +/* s390 floating irq controller (flic) */ +struct kvm_device_ops kvm_flic_ops = { + .name = "kvm-flic", + .get_attr = flic_get_attr, + .set_attr = flic_set_attr, + .has_attr = flic_has_attr, + .create = flic_create, + .destroy = flic_destroy, +}; + +static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) +{ + unsigned long bit; + + bit = bit_nr + (addr % PAGE_SIZE) * 8; + + return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit; +} + +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) +{ + struct page *page = NULL; + + mmap_read_lock(kvm->mm); + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL); + mmap_read_unlock(kvm->mm); + return page; +} + +static int adapter_indicators_set(struct kvm *kvm, + struct s390_io_adapter *adapter, + struct kvm_s390_adapter_int *adapter_int) +{ + unsigned long bit; + int summary_set, idx; + struct page *ind_page, *summary_page; + void *map; + + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) + return -1; + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); + return -1; + } + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); + summary_set = test_and_set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); + srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); + return summary_set ? 0 : 1; +} + +/* + * < 0 - not injected due to error + * = 0 - coalesced, summary indicator already active + * > 0 - injected interrupt + */ +static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + int ret; + struct s390_io_adapter *adapter; + + /* We're only interested in the 0->1 transition. */ + if (!level) + return 0; + adapter = get_io_adapter(kvm, e->adapter.adapter_id); + if (!adapter) + return -1; + ret = adapter_indicators_set(kvm, adapter, &e->adapter); + if ((ret > 0) && !adapter->masked) { + ret = kvm_s390_inject_airq(kvm, adapter); + if (ret == 0) + ret = 1; + } + return ret; +} + +/* + * Inject the machine check to the guest. + */ +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info) +{ + struct kvm_s390_interrupt_info inti; + struct kvm_s390_irq irq; + struct kvm_s390_mchk_info *mchk; + union mci mci; + __u64 cr14 = 0; /* upper bits are not used */ + int rc; + + mci.val = mcck_info->mcic; + if (mci.sr) + cr14 |= CR14_RECOVERY_SUBMASK; + if (mci.dg) + cr14 |= CR14_DEGRADATION_SUBMASK; + if (mci.w) + cr14 |= CR14_WARNING_SUBMASK; + + mchk = mci.ck ? &inti.mchk : &irq.u.mchk; + mchk->cr14 = cr14; + mchk->mcic = mcck_info->mcic; + mchk->ext_damage_code = mcck_info->ext_damage_code; + mchk->failing_storage_address = mcck_info->failing_storage_address; + if (mci.ck) { + /* Inject the floating machine check */ + inti.type = KVM_S390_MCHK; + rc = __inject_vm(vcpu->kvm, &inti); + } else { + /* Inject the machine check to specified vcpu */ + irq.type = KVM_S390_MCHK; + rc = kvm_s390_inject_vcpu(vcpu, &irq); + } + WARN_ON_ONCE(rc); +} + +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + u64 uaddr; + + switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ + case KVM_IRQ_ROUTING_S390_ADAPTER: + e->set = set_adapter_int; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; + e->adapter.summary_offset = ue->u.adapter.summary_offset; + e->adapter.ind_offset = ue->u.adapter.ind_offset; + e->adapter.adapter_id = ue->u.adapter.adapter_id; + return 0; + default: + return -EINVAL; + } +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + return -EINVAL; +} + +int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *irqstate, int len) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_irq *buf; + int r = 0; + int n; + + buf = vmalloc(len); + if (!buf) + return -ENOMEM; + + if (copy_from_user((void *) buf, irqstate, len)) { + r = -EFAULT; + goto out_free; + } + + /* + * Don't allow setting the interrupt state + * when there are already interrupts pending + */ + spin_lock(&li->lock); + if (li->pending_irqs) { + r = -EBUSY; + goto out_unlock; + } + + for (n = 0; n < len / sizeof(*buf); n++) { + r = do_inject_vcpu(vcpu, &buf[n]); + if (r) + break; + } + +out_unlock: + spin_unlock(&li->lock); +out_free: + vfree(buf); + + return r; +} + +static void store_local_irq(struct kvm_s390_local_interrupt *li, + struct kvm_s390_irq *irq, + unsigned long irq_type) +{ + switch (irq_type) { + case IRQ_PEND_MCHK_EX: + case IRQ_PEND_MCHK_REP: + irq->type = KVM_S390_MCHK; + irq->u.mchk = li->irq.mchk; + break; + case IRQ_PEND_PROG: + irq->type = KVM_S390_PROGRAM_INT; + irq->u.pgm = li->irq.pgm; + break; + case IRQ_PEND_PFAULT_INIT: + irq->type = KVM_S390_INT_PFAULT_INIT; + irq->u.ext = li->irq.ext; + break; + case IRQ_PEND_EXT_EXTERNAL: + irq->type = KVM_S390_INT_EXTERNAL_CALL; + irq->u.extcall = li->irq.extcall; + break; + case IRQ_PEND_EXT_CLOCK_COMP: + irq->type = KVM_S390_INT_CLOCK_COMP; + break; + case IRQ_PEND_EXT_CPU_TIMER: + irq->type = KVM_S390_INT_CPU_TIMER; + break; + case IRQ_PEND_SIGP_STOP: + irq->type = KVM_S390_SIGP_STOP; + irq->u.stop = li->irq.stop; + break; + case IRQ_PEND_RESTART: + irq->type = KVM_S390_RESTART; + break; + case IRQ_PEND_SET_PREFIX: + irq->type = KVM_S390_SIGP_SET_PREFIX; + irq->u.prefix = li->irq.prefix; + break; + } +} + +int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) +{ + int scn; + DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + unsigned long pending_irqs; + struct kvm_s390_irq irq; + unsigned long irq_type; + int cpuaddr; + int n = 0; + + spin_lock(&li->lock); + pending_irqs = li->pending_irqs; + memcpy(&sigp_emerg_pending, &li->sigp_emerg_pending, + sizeof(sigp_emerg_pending)); + spin_unlock(&li->lock); + + for_each_set_bit(irq_type, &pending_irqs, IRQ_PEND_COUNT) { + memset(&irq, 0, sizeof(irq)); + if (irq_type == IRQ_PEND_EXT_EMERGENCY) + continue; + if (n + sizeof(irq) > len) + return -ENOBUFS; + store_local_irq(&vcpu->arch.local_int, &irq, irq_type); + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + + if (test_bit(IRQ_PEND_EXT_EMERGENCY, &pending_irqs)) { + for_each_set_bit(cpuaddr, sigp_emerg_pending, KVM_MAX_VCPUS) { + memset(&irq, 0, sizeof(irq)); + if (n + sizeof(irq) > len) + return -ENOBUFS; + irq.type = KVM_S390_INT_EMERGENCY; + irq.u.emerg.code = cpuaddr; + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + } + + if (sca_ext_call_pending(vcpu, &scn)) { + if (n + sizeof(irq) > len) + return -ENOBUFS; + memset(&irq, 0, sizeof(irq)); + irq.type = KVM_S390_INT_EXTERNAL_CALL; + irq.u.extcall.code = scn; + if (copy_to_user(&buf[n], &irq, sizeof(irq))) + return -EFAULT; + n += sizeof(irq); + } + + return n; +} + +static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) +{ + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; + + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (psw_ioint_disabled(vcpu)) + continue; + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { + /* lately kicked but not yet running */ + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; + } + } +} + +static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) +{ + struct kvm_s390_gisa_interrupt *gi = + container_of(timer, struct kvm_s390_gisa_interrupt, timer); + struct kvm *kvm = + container_of(gi->origin, struct sie_page2, gisa)->kvm; + u8 pending_mask; + + pending_mask = gisa_get_ipm_or_restore_iam(gi); + if (pending_mask) { + __airqs_kick_single_vcpu(kvm, pending_mask); + hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); + return HRTIMER_RESTART; + } + + return HRTIMER_NORESTART; +} + +#define NULL_GISA_ADDR 0x00000000UL +#define NONE_GISA_ADDR 0x00000001UL +#define GISA_ADDR_MASK 0xfffff000UL + +static void process_gib_alert_list(void) +{ + struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; + struct kvm_s390_gisa *gisa; + struct kvm *kvm; + + do { + /* + * If the NONE_GISA_ADDR is still stored in the alert list + * origin, we will leave the outer loop. No further GISA has + * been added to the alert list by millicode while processing + * the current alert list. + */ + final = (origin & NONE_GISA_ADDR); + /* + * Cut off the alert list and store the NONE_GISA_ADDR in the + * alert list origin to avoid further GAL interruptions. + * A new alert list can be build up by millicode in parallel + * for guests not in the yet cut-off alert list. When in the + * final loop, store the NULL_GISA_ADDR instead. This will re- + * enable GAL interruptions on the host again. + */ + origin = xchg(&gib->alert_list_origin, + (!final) ? NONE_GISA_ADDR : NULL_GISA_ADDR); + /* + * Loop through the just cut-off alert list and start the + * gisa timers to kick idle vcpus to consume the pending + * interruptions asap. + */ + while (origin & GISA_ADDR_MASK) { + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); + origin = gisa->next_alert; + gisa->next_alert = gisa_phys; + kvm = container_of(gisa, struct sie_page2, gisa)->kvm; + gi = &kvm->arch.gisa_int; + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + } + } while (!final); + +} + +void kvm_s390_gisa_clear(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return; + gisa_clear_ipm(gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); +} + +void kvm_s390_gisa_init(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!css_general_characteristics.aiv) + return; + gi->origin = &kvm->arch.sie_page2->gisa; + gi->alert.mask = 0; + spin_lock_init(&gi->alert.ref_lock); + gi->expires = 50 * 1000; /* 50 usec */ + hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gi->timer.function = gisa_vcpu_kicker; + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); +} + +void kvm_s390_gisa_enable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + u32 gisa_desc; + + if (gi->origin) + return; + kvm_s390_gisa_init(kvm); + gisa_desc = kvm_s390_get_gisa_desc(kvm); + if (!gisa_desc) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->gd = gisa_desc; + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + mutex_unlock(&vcpu->mutex); + } +} + +void kvm_s390_gisa_destroy(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_gisa *gisa = gi->origin; + + if (!gi->origin) + return; + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); + hrtimer_cancel(&gi->timer); + gi->origin = NULL; + VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); +} + +void kvm_s390_gisa_disable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!gi->origin) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->eca &= ~ECA_AIV; + vcpu->arch.sie_block->gd = 0U; + mutex_unlock(&vcpu->mutex); + VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id); + } + kvm_s390_gisa_destroy(kvm); +} + +/** + * kvm_s390_gisc_register - register a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function extends the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + */ +int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + gi->alert.ref_count[gisc]++; + if (gi->alert.ref_count[gisc] == 1) { + gi->alert.mask |= 0x80 >> gisc; + gisa_set_iam(gi->origin, gi->alert.mask); + } + spin_unlock(&gi->alert.ref_lock); + + return gib->nisc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_register); + +/** + * kvm_s390_gisc_unregister - unregister a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function reduces the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + * -EINVAL in case the guest ISC is not registered + */ +int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + int rc = 0; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + if (gi->alert.ref_count[gisc] == 0) { + rc = -EINVAL; + goto out; + } + gi->alert.ref_count[gisc]--; + if (gi->alert.ref_count[gisc] == 0) { + gi->alert.mask &= ~(0x80 >> gisc); + gisa_set_iam(gi->origin, gi->alert.mask); + } +out: + spin_unlock(&gi->alert.ref_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); + +static void aen_host_forward(unsigned long si) +{ + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb)); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) +{ + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + + inc_irq_stat(IRQIO_GAL); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } +} + +static struct airq_struct gib_alert_irq = { + .handler = gib_alert_irq_handler, +}; + +void kvm_s390_gib_destroy(void) +{ + if (!gib) + return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } + chsc_sgib(0); + unregister_adapter_interrupt(&gib_alert_irq); + free_page((unsigned long)gib); + gib = NULL; +} + +int __init kvm_s390_gib_init(u8 nisc) +{ + u32 gib_origin; + int rc = 0; + + if (!css_general_characteristics.aiv) { + KVM_EVENT(3, "%s", "gib not initialized, no AIV facility"); + goto out; + } + + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); + if (!gib) { + rc = -ENOMEM; + goto out; + } + + gib_alert_irq.isc = nisc; + if (register_adapter_interrupt(&gib_alert_irq)) { + pr_err("Registering the GIB alert interruption handler failed\n"); + rc = -EIO; + goto out_free_gib; + } + /* adapter interrupts used for AP (applicable here) don't use the LSI */ + *gib_alert_irq.lsi_ptr = 0xff; + + gib->nisc = nisc; + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { + pr_err("Associating the GIB with the AIV facility failed\n"); + free_page((unsigned long)gib); + gib = NULL; + rc = -EIO; + goto out_unreg_gal; + } + + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + goto out; + +out_unreg_gal: + unregister_adapter_interrupt(&gib_alert_irq); +out_free_gib: + free_page((unsigned long)gib); + gib = NULL; +out: + return rc; +} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c new file mode 100644 index 0000000000..b3f17e014c --- /dev/null +++ b/arch/s390/kvm/kvm-s390.c @@ -0,0 +1,5895 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hosting IBM Z kernel virtual machines (s390x) + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + * Christian Borntraeger + * Christian Ehrhardt + * Jason J. Herne + */ + +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" +#include "pci.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" +#include "trace-s390.h" + +#define MEM_OP_MAX_SIZE 65536 /* Maximum transfer size for KVM_S390_MEM_OP */ +#define LOCAL_IRQS 32 +#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ + (KVM_MAX_VCPUS + LOCAL_IRQS)) + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, inject_io), + STATS_DESC_COUNTER(VM, inject_float_mchk), + STATS_DESC_COUNTER(VM, inject_pfault_done), + STATS_DESC_COUNTER(VM, inject_service_signal), + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward) +}; + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, exit_userspace), + STATS_DESC_COUNTER(VCPU, exit_null), + STATS_DESC_COUNTER(VCPU, exit_external_request), + STATS_DESC_COUNTER(VCPU, exit_io_request), + STATS_DESC_COUNTER(VCPU, exit_external_interrupt), + STATS_DESC_COUNTER(VCPU, exit_stop_request), + STATS_DESC_COUNTER(VCPU, exit_validity), + STATS_DESC_COUNTER(VCPU, exit_instruction), + STATS_DESC_COUNTER(VCPU, exit_pei), + STATS_DESC_COUNTER(VCPU, halt_no_poll_steal), + STATS_DESC_COUNTER(VCPU, instruction_lctl), + STATS_DESC_COUNTER(VCPU, instruction_lctlg), + STATS_DESC_COUNTER(VCPU, instruction_stctl), + STATS_DESC_COUNTER(VCPU, instruction_stctg), + STATS_DESC_COUNTER(VCPU, exit_program_interruption), + STATS_DESC_COUNTER(VCPU, exit_instr_and_program), + STATS_DESC_COUNTER(VCPU, exit_operation_exception), + STATS_DESC_COUNTER(VCPU, deliver_ckc), + STATS_DESC_COUNTER(VCPU, deliver_cputm), + STATS_DESC_COUNTER(VCPU, deliver_external_call), + STATS_DESC_COUNTER(VCPU, deliver_emergency_signal), + STATS_DESC_COUNTER(VCPU, deliver_service_signal), + STATS_DESC_COUNTER(VCPU, deliver_virtio), + STATS_DESC_COUNTER(VCPU, deliver_stop_signal), + STATS_DESC_COUNTER(VCPU, deliver_prefix_signal), + STATS_DESC_COUNTER(VCPU, deliver_restart_signal), + STATS_DESC_COUNTER(VCPU, deliver_program), + STATS_DESC_COUNTER(VCPU, deliver_io), + STATS_DESC_COUNTER(VCPU, deliver_machine_check), + STATS_DESC_COUNTER(VCPU, exit_wait_state), + STATS_DESC_COUNTER(VCPU, inject_ckc), + STATS_DESC_COUNTER(VCPU, inject_cputm), + STATS_DESC_COUNTER(VCPU, inject_external_call), + STATS_DESC_COUNTER(VCPU, inject_emergency_signal), + STATS_DESC_COUNTER(VCPU, inject_mchk), + STATS_DESC_COUNTER(VCPU, inject_pfault_init), + STATS_DESC_COUNTER(VCPU, inject_program), + STATS_DESC_COUNTER(VCPU, inject_restart), + STATS_DESC_COUNTER(VCPU, inject_set_prefix), + STATS_DESC_COUNTER(VCPU, inject_stop_signal), + STATS_DESC_COUNTER(VCPU, instruction_epsw), + STATS_DESC_COUNTER(VCPU, instruction_gs), + STATS_DESC_COUNTER(VCPU, instruction_io_other), + STATS_DESC_COUNTER(VCPU, instruction_lpsw), + STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_pfmf), + STATS_DESC_COUNTER(VCPU, instruction_ptff), + STATS_DESC_COUNTER(VCPU, instruction_sck), + STATS_DESC_COUNTER(VCPU, instruction_sckpf), + STATS_DESC_COUNTER(VCPU, instruction_stidp), + STATS_DESC_COUNTER(VCPU, instruction_spx), + STATS_DESC_COUNTER(VCPU, instruction_stpx), + STATS_DESC_COUNTER(VCPU, instruction_stap), + STATS_DESC_COUNTER(VCPU, instruction_iske), + STATS_DESC_COUNTER(VCPU, instruction_ri), + STATS_DESC_COUNTER(VCPU, instruction_rrbe), + STATS_DESC_COUNTER(VCPU, instruction_sske), + STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock), + STATS_DESC_COUNTER(VCPU, instruction_stsi), + STATS_DESC_COUNTER(VCPU, instruction_stfl), + STATS_DESC_COUNTER(VCPU, instruction_tb), + STATS_DESC_COUNTER(VCPU, instruction_tpi), + STATS_DESC_COUNTER(VCPU, instruction_tprot), + STATS_DESC_COUNTER(VCPU, instruction_tsch), + STATS_DESC_COUNTER(VCPU, instruction_sie), + STATS_DESC_COUNTER(VCPU, instruction_essa), + STATS_DESC_COUNTER(VCPU, instruction_sthyi), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running), + STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call), + STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_start), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_arch), + STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix), + STATS_DESC_COUNTER(VCPU, instruction_sigp_restart), + STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_10), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_44), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c), + STATS_DESC_COUNTER(VCPU, diag_9c_ignored), + STATS_DESC_COUNTER(VCPU, diag_9c_forward), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_258), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), + STATS_DESC_COUNTER(VCPU, pfault_sync) +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + +/* allow 1m huge page guest backing, if !nested */ +static int hpage; +module_param(hpage, int, 0444); +MODULE_PARM_DESC(hpage, "1m huge page backing support"); + +/* maximum percentage of steal time for polling. >100 is treated like 100 */ +static u8 halt_poll_max_steal = 10; +module_param(halt_poll_max_steal, byte, 0644); +MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); + +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + +/* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + +/* + * For now we handle at most 16 double words as this is what the s390 base + * kernel handles and stores in the prefix page. If we ever need to go beyond + * this, this requires changes to code, but the external uapi can stay. + */ +#define SIZE_INTERNAL 16 + +/* + * Base feature mask that defines default mask for facilities. Consists of the + * defines in FACILITIES_KVM and the non-hypervisor managed bits. + */ +static unsigned long kvm_s390_fac_base[SIZE_INTERNAL] = { FACILITIES_KVM }; +/* + * Extended feature mask. Consists of the defines in FACILITIES_KVM_CPUMODEL + * and defines the facilities that can be enabled via a cpu model. + */ +static unsigned long kvm_s390_fac_ext[SIZE_INTERNAL] = { FACILITIES_KVM_CPUMODEL }; + +static unsigned long kvm_s390_fac_size(void) +{ + BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64); + BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64); + BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) > + sizeof(stfle_fac_list)); + + return SIZE_INTERNAL; +} + +/* available cpu features supported by kvm */ +static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); +/* available subfunctions indicated via query / "test bit" */ +static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; + +static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; +debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; + +/* Section: not file related */ +/* forward declarations */ +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); + +static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) +{ + u8 delta_idx = 0; + + /* + * The TOD jumps by delta, we have to compensate this by adding + * -delta to the epoch. + */ + delta = -delta; + + /* sign-extension - we're adding to signed values below */ + if ((s64)delta < 0) + delta_idx = -1; + + scb->epoch += delta; + if (scb->ecd & ECD_MEF) { + scb->epdx += delta_idx; + if (scb->epoch < delta) + scb->epdx += 1; + } +} + +/* + * This callback is executed during stop_machine(). All CPUs are therefore + * temporarily stopped. In order not to change guest behavior, we have to + * disable preemption whenever we touch the epoch of kvm and the VCPUs, + * so a CPU won't be stopped while calculating with the epoch. + */ +static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, + void *v) +{ + struct kvm *kvm; + struct kvm_vcpu *vcpu; + unsigned long i; + unsigned long long *delta = v; + + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_clock_sync_scb(vcpu->arch.sie_block, *delta); + if (i == 0) { + kvm->arch.epoch = vcpu->arch.sie_block->epoch; + kvm->arch.epdx = vcpu->arch.sie_block->epdx; + } + if (vcpu->arch.cputm_enabled) + vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + kvm_clock_sync_scb(vcpu->arch.vsie_block, + *delta); + } + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_clock_notifier = { + .notifier_call = kvm_clock_sync, +}; + +static void allow_cpu_feat(unsigned long nr) +{ + set_bit_inv(nr, kvm_s390_available_cpu_feat); +} + +static inline int plo_test_bit(unsigned char nr) +{ + unsigned long function = (unsigned long)nr | 0x100; + int cc; + + asm volatile( + " lgr 0,%[function]\n" + /* Parameter registers are ignored for "test bit" */ + " plo 0,0,0,0(0)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : [function] "d" (function) + : "cc", "0"); + return cc == 0; +} + +static __always_inline void __insn32_query(unsigned int opcode, u8 *query) +{ + asm volatile( + " lghi 0,0\n" + " lgr 1,%[query]\n" + /* Parameter registers are ignored */ + " .insn rrf,%[opc] << 16,2,4,6,0\n" + : + : [query] "d" ((unsigned long)query), [opc] "i" (opcode) + : "cc", "memory", "0", "1"); +} + +#define INSN_SORTL 0xb938 +#define INSN_DFLTCC 0xb939 + +static void __init kvm_s390_cpu_feat_init(void) +{ + int i; + + for (i = 0; i < 256; ++i) { + if (plo_test_bit(i)) + kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7); + } + + if (test_facility(28)) /* TOD-clock steering */ + ptff(kvm_s390_available_subfunc.ptff, + sizeof(kvm_s390_available_subfunc.ptff), + PTFF_QAF); + + if (test_facility(17)) { /* MSA */ + __cpacf_query(CPACF_KMAC, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmac); + __cpacf_query(CPACF_KMC, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmc); + __cpacf_query(CPACF_KM, (cpacf_mask_t *) + kvm_s390_available_subfunc.km); + __cpacf_query(CPACF_KIMD, (cpacf_mask_t *) + kvm_s390_available_subfunc.kimd); + __cpacf_query(CPACF_KLMD, (cpacf_mask_t *) + kvm_s390_available_subfunc.klmd); + } + if (test_facility(76)) /* MSA3 */ + __cpacf_query(CPACF_PCKMO, (cpacf_mask_t *) + kvm_s390_available_subfunc.pckmo); + if (test_facility(77)) { /* MSA4 */ + __cpacf_query(CPACF_KMCTR, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmctr); + __cpacf_query(CPACF_KMF, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmf); + __cpacf_query(CPACF_KMO, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmo); + __cpacf_query(CPACF_PCC, (cpacf_mask_t *) + kvm_s390_available_subfunc.pcc); + } + if (test_facility(57)) /* MSA5 */ + __cpacf_query(CPACF_PRNO, (cpacf_mask_t *) + kvm_s390_available_subfunc.ppno); + + if (test_facility(146)) /* MSA8 */ + __cpacf_query(CPACF_KMA, (cpacf_mask_t *) + kvm_s390_available_subfunc.kma); + + if (test_facility(155)) /* MSA9 */ + __cpacf_query(CPACF_KDSA, (cpacf_mask_t *) + kvm_s390_available_subfunc.kdsa); + + if (test_facility(150)) /* SORTL */ + __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); + + if (test_facility(151)) /* DFLTCC */ + __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); + + if (MACHINE_HAS_ESOP) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3) || !nested) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + if (sclp.has_kss) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. + */ +} + +static int __init __kvm_s390_init(void) +{ + int rc = -ENOMEM; + + kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf) + return -ENOMEM; + + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto err_kvm_uv; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) + goto err_debug_view; + + kvm_s390_cpu_feat_init(); + + /* Register floating interrupt controller interface. */ + rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); + if (rc) { + pr_err("A FLIC registration call failed with rc=%d\n", rc); + goto err_flic; + } + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto err_pci; + } + } + + rc = kvm_s390_gib_init(GAL_ISC); + if (rc) + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + + return 0; + +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); + return rc; +} + +static void __kvm_s390_exit(void) +{ + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + + kvm_s390_gib_destroy(); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); + debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); +} + +/* Section: device related */ +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + if (ioctl == KVM_S390_ENABLE_SIE) + return s390_enable_sie(); + return -EINVAL; +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_S390_PSW: + case KVM_CAP_S390_GMAP: + case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_S390_UCONTROL + case KVM_CAP_S390_UCONTROL: +#endif + case KVM_CAP_ASYNC_PF: + case KVM_CAP_SYNC_REGS: + case KVM_CAP_ONE_REG: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_S390_CSS_SUPPORT: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_S390_IRQCHIP: + case KVM_CAP_VM_ATTRIBUTES: + case KVM_CAP_MP_STATE: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_S390_INJECT_IRQ: + case KVM_CAP_S390_USER_SIGP: + case KVM_CAP_S390_USER_STSI: + case KVM_CAP_S390_SKEYS: + case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_CMMA_MIGRATION: + case KVM_CAP_S390_AIS: + case KVM_CAP_S390_AIS_MIGRATION: + case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: + case KVM_CAP_IRQFD_RESAMPLE: + r = 1; + break; + case KVM_CAP_SET_GUEST_DEBUG2: + r = KVM_GUESTDBG_VALID_MASK; + break; + case KVM_CAP_S390_HPAGE_1M: + r = 0; + if (hpage && !kvm_is_ucontrol(kvm)) + r = 1; + break; + case KVM_CAP_S390_MEM_OP: + r = MEM_OP_MAX_SIZE; + break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. + */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + r = KVM_S390_BSCA_CPU_SLOTS; + if (!kvm_s390_use_sca_entries()) + r = KVM_MAX_VCPUS; + else if (sclp.has_esca && sclp.has_64bscao) + r = KVM_S390_ESCA_CPU_SLOTS; + if (ext == KVM_CAP_NR_VCPUS) + r = min_t(unsigned int, num_online_cpus(), r); + break; + case KVM_CAP_S390_COW: + r = MACHINE_HAS_ESOP; + break; + case KVM_CAP_S390_VECTOR_REGISTERS: + r = MACHINE_HAS_VX; + break; + case KVM_CAP_S390_RI: + r = test_facility(64); + break; + case KVM_CAP_S390_GS: + r = test_facility(133); + break; + case KVM_CAP_S390_BPB: + r = test_facility(82); + break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; + default: + r = 0; + } + return r; +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + int i; + gfn_t cur_gfn, last_gfn; + unsigned long gaddr, vmaddr; + struct gmap *gmap = kvm->arch.gmap; + DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); + + /* Loop over all guest segments */ + cur_gfn = memslot->base_gfn; + last_gfn = memslot->base_gfn + memslot->npages; + for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { + gaddr = gfn_to_gpa(cur_gfn); + vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); + if (kvm_is_error_hva(vmaddr)) + continue; + + bitmap_zero(bitmap, _PAGE_ENTRIES); + gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); + for (i = 0; i < _PAGE_ENTRIES; i++) { + if (test_bit(i, bitmap)) + mark_page_dirty(kvm, cur_gfn + i); + } + + if (fatal_signal_pending(current)) + return; + cond_resched(); + } +} + +/* Section: vm related */ +static void sca_del_vcpu(struct kvm_vcpu *vcpu); + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r; + unsigned long n; + struct kvm_memory_slot *memslot; + int is_dirty; + + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + + r = -EINVAL; + if (log->slot >= KVM_USER_MEM_SLOTS) + goto out; + + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); + if (r) + goto out; + + /* Clear the dirty log */ + if (is_dirty) { + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + } + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_S390_IRQCHIP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_IRQCHIP"); + kvm->arch.use_irqchip = 1; + r = 0; + break; + case KVM_CAP_S390_USER_SIGP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_SIGP"); + kvm->arch.user_sigp = 1; + r = 0; + break; + case KVM_CAP_S390_VECTOR_REGISTERS: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (MACHINE_HAS_VX) { + set_kvm_facility(kvm->arch.model.fac_mask, 129); + set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(134)) { + set_kvm_facility(kvm->arch.model.fac_mask, 134); + set_kvm_facility(kvm->arch.model.fac_list, 134); + } + if (test_facility(135)) { + set_kvm_facility(kvm->arch.model.fac_mask, 135); + set_kvm_facility(kvm->arch.model.fac_list, 135); + } + if (test_facility(148)) { + set_kvm_facility(kvm->arch.model.fac_mask, 148); + set_kvm_facility(kvm->arch.model.fac_list, 148); + } + if (test_facility(152)) { + set_kvm_facility(kvm->arch.model.fac_mask, 152); + set_kvm_facility(kvm->arch.model.fac_list, 152); + } + if (test_facility(192)) { + set_kvm_facility(kvm->arch.model.fac_mask, 192); + set_kvm_facility(kvm->arch.model.fac_list, 192); + } + r = 0; + } else + r = -EINVAL; + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_VECTOR_REGISTERS %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_RI: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(64)) { + set_kvm_facility(kvm->arch.model.fac_mask, 64); + set_kvm_facility(kvm->arch.model.fac_list, 64); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_AIS: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else { + set_kvm_facility(kvm->arch.model.fac_mask, 72); + set_kvm_facility(kvm->arch.model.fac_list, 72); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: AIS %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_GS: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(133)) { + set_kvm_facility(kvm->arch.model.fac_mask, 133); + set_kvm_facility(kvm->arch.model.fac_list, 133); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_HPAGE_1M: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + r = -EBUSY; + else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm)) + r = -EINVAL; + else { + r = 0; + mmap_write_lock(kvm->mm); + kvm->mm->context.allow_gmap_hpage_1m = 1; + mmap_write_unlock(kvm->mm); + /* + * We might have to create fake 4k page + * tables. To avoid that the hardware works on + * stale PGSTEs, we emulate these instructions. + */ + kvm->arch.use_skf = 0; + kvm->arch.use_pfmfi = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_USER_STSI: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); + kvm->arch.user_stsi = 1; + r = 0; + break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->attr) { + case KVM_S390_VM_MEM_LIMIT_SIZE: + ret = 0; + VM_EVENT(kvm, 3, "QUERY: max guest memory: %lu bytes", + kvm->arch.mem_limit); + if (put_user(kvm->arch.mem_limit, (u64 __user *)attr->addr)) + ret = -EFAULT; + break; + default: + ret = -ENXIO; + break; + } + return ret; +} + +static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + unsigned int idx; + switch (attr->attr) { + case KVM_S390_VM_MEM_ENABLE_CMMA: + ret = -ENXIO; + if (!sclp.has_cmma) + break; + + VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + ret = -EBUSY; + else if (kvm->mm->context.allow_gmap_hpage_1m) + ret = -EINVAL; + else { + kvm->arch.use_cmma = 1; + /* Not compatible with cmma. */ + kvm->arch.use_pfmfi = 0; + ret = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_S390_VM_MEM_CLR_CMMA: + ret = -ENXIO; + if (!sclp.has_cmma) + break; + ret = -EINVAL; + if (!kvm->arch.use_cmma) + break; + + VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); + mutex_lock(&kvm->lock); + idx = srcu_read_lock(&kvm->srcu); + s390_reset_cmma(kvm->arch.gmap->mm); + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + ret = 0; + break; + case KVM_S390_VM_MEM_LIMIT_SIZE: { + unsigned long new_limit; + + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + + if (get_user(new_limit, (u64 __user *)attr->addr)) + return -EFAULT; + + if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT && + new_limit > kvm->arch.mem_limit) + return -E2BIG; + + if (!new_limit) + return -EINVAL; + + /* gmap_create takes last usable address */ + if (new_limit != KVM_S390_NO_MEM_LIMIT) + new_limit -= 1; + + ret = -EBUSY; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); + + if (!new) { + ret = -ENOMEM; + } else { + gmap_remove(kvm->arch.gmap); + new->private = kvm; + kvm->arch.gmap = new; + ret = 0; + } + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); + VM_EVENT(kvm, 3, "New guest asce: 0x%pK", + (void *) kvm->arch.gmap->asce); + break; + } + default: + ret = -ENXIO; + break; + } + return ret; +} + +static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); + +void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_crypto_setup(vcpu); + /* recreate the shadow crycb by leaving the VSIE handler */ + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + +static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) +{ + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + get_random_bytes( + kvm->arch.crypto.crycb->aes_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + kvm->arch.crypto.aes_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support"); + break; + case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + get_random_bytes( + kvm->arch.crypto.crycb->dea_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + kvm->arch.crypto.dea_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support"); + break; + case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + kvm->arch.crypto.aes_kw = 0; + memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, + sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support"); + break; + case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + kvm->arch.crypto.dea_kw = 0; + memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, + sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support"); + break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 1; + break; + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 0; + break; + default: + mutex_unlock(&kvm->lock); + return -ENXIO; + } + + kvm_s390_vcpu_crypto_reset_all(kvm); + mutex_unlock(&kvm->lock); + return 0; +} + +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + +static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) +{ + unsigned long cx; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(cx, vcpu, kvm) + kvm_s390_sync_request(req, vcpu); +} + +/* + * Must be called with kvm->srcu held to avoid races on memslots, and with + * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + */ +static int kvm_s390_vm_start_migration(struct kvm *kvm) +{ + struct kvm_memory_slot *ms; + struct kvm_memslots *slots; + unsigned long ram_pages = 0; + int bkt; + + /* migration mode already enabled */ + if (kvm->arch.migration_mode) + return 0; + slots = kvm_memslots(kvm); + if (!slots || kvm_memslots_empty(slots)) + return -EINVAL; + + if (!kvm->arch.use_cmma) { + kvm->arch.migration_mode = 1; + return 0; + } + /* mark all the pages in active slots as dirty */ + kvm_for_each_memslot(ms, bkt, slots) { + if (!ms->dirty_bitmap) + return -EINVAL; + /* + * The second half of the bitmap is only used on x86, + * and would be wasted otherwise, so we put it to good + * use here to keep track of the state of the storage + * attributes. + */ + memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); + ram_pages += ms->npages; + } + atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); + kvm->arch.migration_mode = 1; + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + return 0; +} + +/* + * Must be called with kvm->slots_lock to avoid races with ourselves and + * kvm_s390_vm_start_migration. + */ +static int kvm_s390_vm_stop_migration(struct kvm *kvm) +{ + /* migration mode already disabled */ + if (!kvm->arch.migration_mode) + return 0; + kvm->arch.migration_mode = 0; + if (kvm->arch.use_cmma) + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + return 0; +} + +static int kvm_s390_vm_set_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + int res = -ENXIO; + + mutex_lock(&kvm->slots_lock); + switch (attr->attr) { + case KVM_S390_VM_MIGRATION_START: + res = kvm_s390_vm_start_migration(kvm); + break; + case KVM_S390_VM_MIGRATION_STOP: + res = kvm_s390_vm_stop_migration(kvm); + break; + default: + break; + } + mutex_unlock(&kvm->slots_lock); + + return res; +} + +static int kvm_s390_vm_get_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u64 mig = kvm->arch.migration_mode; + + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) + return -ENXIO; + + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + +static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_tod_clock gtod; + + if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) + return -EFAULT; + + if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) + return -EINVAL; + __kvm_s390_set_tod_clock(kvm, >od); + + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", + gtod.epoch_idx, gtod.tod); + + return 0; +} + +static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u8 gtod_high; + + if (copy_from_user(>od_high, (void __user *)attr->addr, + sizeof(gtod_high))) + return -EFAULT; + + if (gtod_high != 0) + return -EINVAL; + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high); + + return 0; +} + +static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_tod_clock gtod = { 0 }; + + if (copy_from_user(>od.tod, (void __user *)attr->addr, + sizeof(gtod.tod))) + return -EFAULT; + + __kvm_s390_set_tod_clock(kvm, >od); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); + return 0; +} + +static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + if (attr->flags) + return -EINVAL; + + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + switch (attr->attr) { + case KVM_S390_VM_TOD_EXT: + ret = kvm_s390_set_tod_ext(kvm, attr); + break; + case KVM_S390_VM_TOD_HIGH: + ret = kvm_s390_set_tod_high(kvm, attr); + break; + case KVM_S390_VM_TOD_LOW: + ret = kvm_s390_set_tod_low(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + +out_unlock: + mutex_unlock(&kvm->lock); + return ret; +} + +static void kvm_s390_get_tod_clock(struct kvm *kvm, + struct kvm_s390_vm_tod_clock *gtod) +{ + union tod_clock clk; + + preempt_disable(); + + store_tod_clock_ext(&clk); + + gtod->tod = clk.tod + kvm->arch.epoch; + gtod->epoch_idx = 0; + if (test_kvm_facility(kvm, 139)) { + gtod->epoch_idx = clk.ei + kvm->arch.epdx; + if (gtod->tod < clk.tod) + gtod->epoch_idx += 1; + } + + preempt_enable(); +} + +static int kvm_s390_get_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_tod_clock gtod; + + memset(>od, 0, sizeof(gtod)); + kvm_s390_get_tod_clock(kvm, >od); + if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) + return -EFAULT; + + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x, TOD base: 0x%llx", + gtod.epoch_idx, gtod.tod); + return 0; +} + +static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u8 gtod_high = 0; + + if (copy_to_user((void __user *)attr->addr, >od_high, + sizeof(gtod_high))) + return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high); + + return 0; +} + +static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u64 gtod; + + gtod = kvm_s390_get_tod_clock_fast(kvm); + if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) + return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod); + + return 0; +} + +static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + if (attr->flags) + return -EINVAL; + + switch (attr->attr) { + case KVM_S390_VM_TOD_EXT: + ret = kvm_s390_get_tod_ext(kvm, attr); + break; + case KVM_S390_VM_TOD_HIGH: + ret = kvm_s390_get_tod_high(kvm, attr); + break; + case KVM_S390_VM_TOD_LOW: + ret = kvm_s390_get_tod_low(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + return ret; +} + +static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_processor *proc; + u16 lowest_ibc, unblocked_ibc; + int ret = 0; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + ret = -EBUSY; + goto out; + } + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + if (!proc) { + ret = -ENOMEM; + goto out; + } + if (!copy_from_user(proc, (void __user *)attr->addr, + sizeof(*proc))) { + kvm->arch.model.cpuid = proc->cpuid; + lowest_ibc = sclp.ibc >> 16 & 0xfff; + unblocked_ibc = sclp.ibc & 0xfff; + if (lowest_ibc && proc->ibc) { + if (proc->ibc > unblocked_ibc) + kvm->arch.model.ibc = unblocked_ibc; + else if (proc->ibc < lowest_ibc) + kvm->arch.model.ibc = lowest_ibc; + else + kvm->arch.model.ibc = proc->ibc; + } + memcpy(kvm->arch.model.fac_list, proc->fac_list, + S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); + } else + ret = -EFAULT; + kfree(proc); +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static int kvm_s390_set_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data))) + return -EFAULT; + if (!bitmap_subset((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); + return 0; +} + +static int kvm_s390_set_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + if (copy_from_user(&kvm->arch.model.subfuncs, (void __user *)attr->addr, + sizeof(struct kvm_s390_vm_cpu_subfunc))) { + mutex_unlock(&kvm->lock); + return -EFAULT; + } + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "SET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "SET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "SET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "SET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "SET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "SET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "SET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "SET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "SET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "SET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "SET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "SET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + VM_EVENT(kvm, 3, "SET: guest KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]); + VM_EVENT(kvm, 3, "SET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]); + VM_EVENT(kvm, 3, "SET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + + return 0; +} + +#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \ +( \ + ((struct kvm_s390_vm_cpu_uv_feat){ \ + .ap = 1, \ + .ap_intr = 1, \ + }) \ + .feat \ +) + +static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr; + unsigned long data, filter; + + filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (get_user(data, &ptr->feat)) + return -EFAULT; + if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + kvm->arch.model.uv_feat_guest.feat = data; + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data); + + return 0; +} + +static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + ret = kvm_s390_set_processor(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_set_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_set_processor_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_set_uv_feat(kvm, attr); + break; + } + return ret; +} + +static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_processor *proc; + int ret = 0; + + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + if (!proc) { + ret = -ENOMEM; + goto out; + } + proc->cpuid = kvm->arch.model.cpuid; + proc->ibc = kvm->arch.model.ibc; + memcpy(&proc->fac_list, kvm->arch.model.fac_list, + S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); + if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) + ret = -EFAULT; + kfree(proc); +out: + return ret; +} + +static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_machine *mach; + int ret = 0; + + mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); + if (!mach) { + ret = -ENOMEM; + goto out; + } + get_cpu_id((struct cpuid *) &mach->cpuid); + mach->ibc = sclp.ibc; + memcpy(&mach->fac_mask, kvm->arch.model.fac_mask, + S390_ARCH_FAC_LIST_SIZE_BYTE); + memcpy((unsigned long *)&mach->fac_list, stfle_fac_list, + sizeof(stfle_fac_list)); + VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_mask[0], + mach->fac_mask[1], + mach->fac_mask[2]); + VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_list[0], + mach->fac_list[1], + mach->fac_list[2]); + if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) + ret = -EFAULT; + kfree(mach); +out: + return ret; +} + +static int kvm_s390_get_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); + return 0; +} + +static int kvm_s390_get_machine_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", + data.feat[0], + data.feat[1], + data.feat[2]); + return 0; +} + +static int kvm_s390_get_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (copy_to_user((void __user *)attr->addr, &kvm->arch.model.subfuncs, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + + VM_EVENT(kvm, 3, "GET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "GET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "GET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "GET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "GET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "GET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "GET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "GET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "GET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "GET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "GET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + VM_EVENT(kvm, 3, "GET: guest KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]); + VM_EVENT(kvm, 3, "GET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]); + VM_EVENT(kvm, 3, "GET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + + return 0; +} + +static int kvm_s390_get_machine_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + + VM_EVENT(kvm, 3, "GET: host PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.plo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[1], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[2], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[3]); + VM_EVENT(kvm, 3, "GET: host PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[1]); + VM_EVENT(kvm, 3, "GET: host KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[1]); + VM_EVENT(kvm, 3, "GET: host KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[1]); + VM_EVENT(kvm, 3, "GET: host KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.km)[0], + ((unsigned long *) &kvm_s390_available_subfunc.km)[1]); + VM_EVENT(kvm, 3, "GET: host KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[1]); + VM_EVENT(kvm, 3, "GET: host KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[1]); + VM_EVENT(kvm, 3, "GET: host PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: host KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: host KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[1]); + VM_EVENT(kvm, 3, "GET: host KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[1]); + VM_EVENT(kvm, 3, "GET: host PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[1]); + VM_EVENT(kvm, 3, "GET: host PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[1]); + VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); + VM_EVENT(kvm, 3, "GET: host KDSA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[1]); + VM_EVENT(kvm, 3, "GET: host SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[0], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[1], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[2], + ((unsigned long *) &kvm_s390_available_subfunc.sortl)[3]); + VM_EVENT(kvm, 3, "GET: host DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2], + ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]); + + return 0; +} + +static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat = kvm->arch.model.uv_feat_guest.feat; + + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat; + + BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications)); + + feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + ret = kvm_s390_get_processor(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE: + ret = kvm_s390_get_machine(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_get_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_FEAT: + ret = kvm_s390_get_machine_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_get_processor_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + ret = kvm_s390_get_machine_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_get_processor_uv_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + ret = kvm_s390_get_machine_uv_feat(kvm, attr); + break; + } + return ret; +} + +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. + */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + +static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_S390_VM_MEM_CTRL: + ret = kvm_s390_set_mem_control(kvm, attr); + break; + case KVM_S390_VM_TOD: + ret = kvm_s390_set_tod(kvm, attr); + break; + case KVM_S390_VM_CPU_MODEL: + ret = kvm_s390_set_cpu_model(kvm, attr); + break; + case KVM_S390_VM_CRYPTO: + ret = kvm_s390_vm_set_crypto(kvm, attr); + break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_set_migration(kvm, attr); + break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_S390_VM_MEM_CTRL: + ret = kvm_s390_get_mem_control(kvm, attr); + break; + case KVM_S390_VM_TOD: + ret = kvm_s390_get_tod(kvm, attr); + break; + case KVM_S390_VM_CPU_MODEL: + ret = kvm_s390_get_cpu_model(kvm, attr); + break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_get_migration(kvm, attr); + break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_S390_VM_MEM_CTRL: + switch (attr->attr) { + case KVM_S390_VM_MEM_ENABLE_CMMA: + case KVM_S390_VM_MEM_CLR_CMMA: + ret = sclp.has_cmma ? 0 : -ENXIO; + break; + case KVM_S390_VM_MEM_LIMIT_SIZE: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_TOD: + switch (attr->attr) { + case KVM_S390_VM_TOD_LOW: + case KVM_S390_VM_TOD_HIGH: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_CPU_MODEL: + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + case KVM_S390_VM_CPU_MACHINE: + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + case KVM_S390_VM_CPU_MACHINE_FEAT: + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_CRYPTO: + switch (attr->attr) { + case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: + ret = 0; + break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + ret = ap_instructions_available() ? 0 : -ENXIO; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_MIGRATION: + ret = 0; + break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +{ + uint8_t *keys; + uint64_t hva; + int srcu_idx, i, r = 0; + + if (args->flags != 0) + return -EINVAL; + + /* Is this guest using storage keys? */ + if (!mm_uses_skeys(current->mm)) + return KVM_S390_GET_SKEYS_NONE; + + /* Enforce sane limit on memory allocation */ + if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) + return -EINVAL; + + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + if (!keys) + return -ENOMEM; + + mmap_read_lock(current->mm); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + r = get_guest_storage_key(current->mm, hva, &keys[i]); + if (r) + break; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + mmap_read_unlock(current->mm); + + if (!r) { + r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, + sizeof(uint8_t) * args->count); + if (r) + r = -EFAULT; + } + + kvfree(keys); + return r; +} + +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +{ + uint8_t *keys; + uint64_t hva; + int srcu_idx, i, r = 0; + bool unlocked; + + if (args->flags != 0) + return -EINVAL; + + /* Enforce sane limit on memory allocation */ + if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) + return -EINVAL; + + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + if (!keys) + return -ENOMEM; + + r = copy_from_user(keys, (uint8_t __user *)args->skeydata_addr, + sizeof(uint8_t) * args->count); + if (r) { + r = -EFAULT; + goto out; + } + + /* Enable storage key handling for the guest */ + r = s390_enable_skey(); + if (r) + goto out; + + i = 0; + mmap_read_lock(current->mm); + srcu_idx = srcu_read_lock(&kvm->srcu); + while (i < args->count) { + unlocked = false; + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + /* Lowest order bit is reserved */ + if (keys[i] & 0x01) { + r = -EINVAL; + break; + } + + r = set_guest_storage_key(current->mm, hva, keys[i], 0); + if (r) { + r = fixup_user_fault(current->mm, hva, + FAULT_FLAG_WRITE, &unlocked); + if (r) + break; + } + if (!r) + i++; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + mmap_read_unlock(current->mm); +out: + kvfree(keys); + return r; +} + +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* for consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long pgstev, hva, cur_gfn = args->start_gfn; + + args->count = 0; + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + /* + * We return an error if the first value was invalid, but we + * return successfully if at least one value was copied. + */ + if (kvm_is_error_hva(hva)) + return args->count ? 0 : -EFAULT; + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + res[args->count++] = (pgstev >> 24) & 0x43; + cur_gfn++; + } + + return 0; +} + +static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, + gfn_t gfn) +{ + return ____gfn_to_memslot(slots, gfn, true); +} + +static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, + unsigned long cur_gfn) +{ + struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); + unsigned long ofs = cur_gfn - ms->base_gfn; + struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; + + if (ms->base_gfn + ms->npages <= cur_gfn) { + mnode = rb_next(mnode); + /* If we are above the highest slot, wrap around */ + if (!mnode) + mnode = rb_first(&slots->gfn_tree); + + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); + ofs = 0; + } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); + while (ofs >= ms->npages && (mnode = rb_next(mnode))) { + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); + } + return ms->base_gfn + ofs; +} + +static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, + u8 *res, unsigned long bufsize) +{ + unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *ms; + + if (unlikely(kvm_memslots_empty(slots))) + return 0; + + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); + ms = gfn_to_memslot(kvm, cur_gfn); + args->count = 0; + args->start_gfn = cur_gfn; + if (!ms) + return 0; + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + mem_end = kvm_s390_get_gfn_end(slots); + + while (args->count < bufsize) { + hva = gfn_to_hva(kvm, cur_gfn); + if (kvm_is_error_hva(hva)) + return 0; + /* Decrement only if we actually flipped the bit to 0 */ + if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) + atomic64_dec(&kvm->arch.cmma_dirty_pages); + if (get_pgste(kvm->mm, hva, &pgstev) < 0) + pgstev = 0; + /* Save the value */ + res[args->count++] = (pgstev >> 24) & 0x43; + /* If the next bit is too far away, stop. */ + if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) + return 0; + /* If we reached the previous "next", find the next one */ + if (cur_gfn == next_gfn) + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); + /* Reached the end of memory or of the buffer, stop */ + if ((next_gfn >= mem_end) || + (next_gfn - args->start_gfn >= bufsize)) + return 0; + cur_gfn++; + /* Reached the end of the current memslot, take the next one. */ + if (cur_gfn - ms->base_gfn >= ms->npages) { + ms = gfn_to_memslot(kvm, cur_gfn); + if (!ms) + return 0; + } + } + return 0; +} + +/* + * This function searches for the next page with dirty CMMA attributes, and + * saves the attributes in the buffer up to either the end of the buffer or + * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; + * no trailing clean bytes are saved. + * In case no dirty bits were found, or if CMMA was not enabled or used, the + * output buffer will indicate 0 as length. + */ +static int kvm_s390_get_cmma_bits(struct kvm *kvm, + struct kvm_s390_cmma_log *args) +{ + unsigned long bufsize; + int srcu_idx, peek, ret; + u8 *values; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* Invalid/unsupported flags were specified */ + if (args->flags & ~KVM_S390_CMMA_PEEK) + return -EINVAL; + /* Migration mode query, and we are not doing a migration */ + peek = !!(args->flags & KVM_S390_CMMA_PEEK); + if (!peek && !kvm->arch.migration_mode) + return -EINVAL; + /* CMMA is disabled or was not used, or the buffer has length zero */ + bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!bufsize || !kvm->mm->context.uses_cmm) { + memset(args, 0, sizeof(*args)); + return 0; + } + /* We are not peeking, and there are no dirty pages */ + if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; + } + + values = vmalloc(bufsize); + if (!values) + return -ENOMEM; + + mmap_read_lock(kvm->mm); + srcu_idx = srcu_read_lock(&kvm->srcu); + if (peek) + ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); + else + ret = kvm_s390_get_cmma(kvm, args, values, bufsize); + srcu_read_unlock(&kvm->srcu, srcu_idx); + mmap_read_unlock(kvm->mm); + + if (kvm->arch.migration_mode) + args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); + else + args->remaining = 0; + + if (copy_to_user((void __user *)args->values, values, args->count)) + ret = -EFAULT; + + vfree(values); + return ret; +} + +/* + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.uses_cmm flag is set. + */ +static int kvm_s390_set_cmma_bits(struct kvm *kvm, + const struct kvm_s390_cmma_log *args) +{ + unsigned long hva, mask, pgstev, i; + uint8_t *bits; + int srcu_idx, r = 0; + + mask = args->mask; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* invalid/unsupported flags */ + if (args->flags != 0) + return -EINVAL; + /* Enforce sane limit on memory allocation */ + if (args->count > KVM_S390_CMMA_SIZE_MAX) + return -EINVAL; + /* Nothing to do */ + if (args->count == 0) + return 0; + + bits = vmalloc(array_size(sizeof(*bits), args->count)); + if (!bits) + return -ENOMEM; + + r = copy_from_user(bits, (void __user *)args->values, args->count); + if (r) { + r = -EFAULT; + goto out; + } + + mmap_read_lock(kvm->mm); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + pgstev = bits[i]; + pgstev = pgstev << 24; + mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; + set_pgste_bits(kvm->mm, hva, mask, pgstev); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + mmap_read_unlock(kvm->mm); + + if (!kvm->mm->context.uses_cmm) { + mmap_write_lock(kvm->mm); + kvm->mm->context.uses_cmm = 1; + mmap_write_unlock(kvm->mm); + } +out: + vfree(bits); + return r; +} + +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + u16 _rc, _rrc; + int ret = 0; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */ + if (use_gisa) + kvm_s390_gisa_enable(kvm); + return ret; +} + +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + unsigned long i; + int r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + /* Disable the GISA if the ultravisor does not support AIV. */ + if (!uv_has_feature(BIT_UV_FEAT_AIV)) + kvm_s390_gisa_disable(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. + */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; + int r = 0; + u16 dummy; + + if (need_lock) + mutex_lock(&kvm->lock); + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + mmap_write_lock(current->mm); + r = gmap_mark_unmergeable(); + mmap_write_unlock(current->mm); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } + default: + r = -ENOTTY; + } + if (need_lock) + mutex_unlock(&kvm->lock); + + return r; +} + +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) +{ + if (mop->flags & ~supported_flags || !mop->size) + return -EINVAL; + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (mop->key > 0xf) + return -EINVAL; + } else { + mop->key = 0; + } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; + } + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; + } + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); + } + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + vfree(tmpbuf); + return r; +} + +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; + int r; + + switch (ioctl) { + case KVM_S390_INTERRUPT: { + struct kvm_s390_interrupt s390int; + + r = -EFAULT; + if (copy_from_user(&s390int, argp, sizeof(s390int))) + break; + r = kvm_s390_inject_vm(kvm, &s390int); + break; + } + case KVM_CREATE_IRQCHIP: { + struct kvm_irq_routing_entry routing; + + r = -EINVAL; + if (kvm->arch.use_irqchip) { + /* Set up dummy routing. */ + memset(&routing, 0, sizeof(routing)); + r = kvm_set_irq_routing(kvm, &routing, 0, 0); + } + break; + } + case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_s390_vm_set_attr(kvm, &attr); + break; + } + case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_s390_vm_get_attr(kvm, &attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_s390_vm_has_attr(kvm, &attr); + break; + } + case KVM_S390_GET_SKEYS: { + struct kvm_s390_skeys args; + + r = -EFAULT; + if (copy_from_user(&args, argp, + sizeof(struct kvm_s390_skeys))) + break; + r = kvm_s390_get_skeys(kvm, &args); + break; + } + case KVM_S390_SET_SKEYS: { + struct kvm_s390_skeys args; + + r = -EFAULT; + if (copy_from_user(&args, argp, + sizeof(struct kvm_s390_skeys))) + break; + r = kvm_s390_set_skeys(kvm, &args); + break; + } + case KVM_S390_GET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + mutex_lock(&kvm->slots_lock); + r = kvm_s390_get_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); + if (!r) { + r = copy_to_user(argp, &args, sizeof(args)); + if (r) + r = -EFAULT; + } + break; + } + case KVM_S390_SET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + mutex_lock(&kvm->slots_lock); + r = kvm_s390_set_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); + break; + } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user cpu state */ + kvm_s390_set_user_cpu_state_ctrl(kvm); + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + /* must be called without kvm->lock */ + r = kvm_s390_handle_pv(kvm, &args); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } + default: + r = -ENOTTY; + } + + return r; +} + +static int kvm_s390_apxa_installed(void) +{ + struct ap_config_info info; + + if (ap_instructions_available()) { + if (ap_qci(&info) == 0) + return info.apxa; + } + + return 0; +} + +/* + * The format of the crypto control block (CRYCB) is specified in the 3 low + * order bits of the CRYCB designation (CRYCBD) field as follows: + * Format 0: Neither the message security assist extension 3 (MSAX3) nor the + * AP extended addressing (APXA) facility are installed. + * Format 1: The APXA facility is not installed but the MSAX3 facility is. + * Format 2: Both the APXA and MSAX3 facilities are installed + */ +static void kvm_s390_set_crycb_format(struct kvm *kvm) +{ + kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; + + /* Clear the CRYCB format bits - i.e., set format 0 by default */ + kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK); + + /* Check whether MSAX3 is installed */ + if (!test_kvm_facility(kvm, 76)) + return; + + if (kvm_s390_apxa_installed()) + kvm->arch.crypto.crycbd |= CRYCB_FORMAT2; + else + kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; +} + +/* + * kvm_arch_crypto_set_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be set. + * @apm: the mask identifying the accessible AP adapters + * @aqm: the mask identifying the accessible AP domains + * @adm: the mask identifying the accessible AP control domains + * + * Set the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ +void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, + unsigned long *aqm, unsigned long *adm) +{ + struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; + + kvm_s390_vcpu_block_all(kvm); + + switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { + case CRYCB_FORMAT2: /* APCB1 use 256 bits */ + memcpy(crycb->apcb1.apm, apm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx %016lx %016lx %016lx", + apm[0], apm[1], apm[2], apm[3]); + memcpy(crycb->apcb1.aqm, aqm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: aqm %016lx %016lx %016lx %016lx", + aqm[0], aqm[1], aqm[2], aqm[3]); + memcpy(crycb->apcb1.adm, adm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: adm %016lx %016lx %016lx %016lx", + adm[0], adm[1], adm[2], adm[3]); + break; + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: /* Fall through both use APCB0 */ + memcpy(crycb->apcb0.apm, apm, 8); + memcpy(crycb->apcb0.aqm, aqm, 2); + memcpy(crycb->apcb0.adm, adm, 2); + VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx aqm %04x adm %04x", + apm[0], *((unsigned short *)aqm), + *((unsigned short *)adm)); + break; + default: /* Can not happen */ + break; + } + + /* recreate the shadow crycb for each vcpu */ + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); + kvm_s390_vcpu_unblock_all(kvm); +} +EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); + +/* + * kvm_arch_crypto_clear_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be cleared. + * + * Clear the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ +void kvm_arch_crypto_clear_masks(struct kvm *kvm) +{ + kvm_s390_vcpu_block_all(kvm); + + memset(&kvm->arch.crypto.crycb->apcb0, 0, + sizeof(kvm->arch.crypto.crycb->apcb0)); + memset(&kvm->arch.crypto.crycb->apcb1, 0, + sizeof(kvm->arch.crypto.crycb->apcb1)); + + VM_EVENT(kvm, 3, "%s", "CLR CRYCB:"); + /* recreate the shadow crycb for each vcpu */ + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); + kvm_s390_vcpu_unblock_all(kvm); +} +EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); + +static u64 kvm_s390_get_initial_cpuid(void) +{ + struct cpuid cpuid; + + get_cpu_id(&cpuid); + cpuid.version = 0xff; + return *((u64 *) &cpuid); +} + +static void kvm_s390_crypto_init(struct kvm *kvm) +{ + kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; + kvm_s390_set_crycb_format(kvm); + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); + + if (!test_kvm_facility(kvm, 76)) + return; + + /* Enable AES/DEA protected key functions by default */ + kvm->arch.crypto.aes_kw = 1; + kvm->arch.crypto.dea_kw = 1; + get_random_bytes(kvm->arch.crypto.crycb->aes_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + get_random_bytes(kvm->arch.crypto.crycb->dea_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); +} + +static void sca_dispose(struct kvm *kvm) +{ + if (kvm->arch.use_esca) + free_pages_exact(kvm->arch.sca, sizeof(struct esca_block)); + else + free_page((unsigned long)(kvm->arch.sca)); + kvm->arch.sca = NULL; +} + +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; + int i, rc; + char debug_name[16]; + static unsigned long sca_offset; + + rc = -EINVAL; +#ifdef CONFIG_KVM_S390_UCONTROL + if (type & ~KVM_VM_S390_UCONTROL) + goto out_err; + if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN))) + goto out_err; +#else + if (type) + goto out_err; +#endif + + rc = s390_enable_sie(); + if (rc) + goto out_err; + + rc = -ENOMEM; + + if (!sclp.has_64bscao) + alloc_flags |= GFP_DMA; + rwlock_init(&kvm->arch.sca_lock); + /* start with basic SCA */ + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + if (!kvm->arch.sca) + goto out_err; + mutex_lock(&kvm_lock); + sca_offset += 16; + if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) + sca_offset = 0; + kvm->arch.sca = (struct bsca_block *) + ((char *) kvm->arch.sca + sca_offset); + mutex_unlock(&kvm_lock); + + sprintf(debug_name, "kvm-%u", current->pid); + + kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); + if (!kvm->arch.dbf) + goto out_err; + + BUILD_BUG_ON(sizeof(struct sie_page2) != 4096); + kvm->arch.sie_page2 = + (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); + if (!kvm->arch.sie_page2) + goto out_err; + + kvm->arch.sie_page2->kvm = kvm; + kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; + + for (i = 0; i < kvm_s390_fac_size(); i++) { + kvm->arch.model.fac_mask[i] = stfle_fac_list[i] & + (kvm_s390_fac_base[i] | + kvm_s390_fac_ext[i]); + kvm->arch.model.fac_list[i] = stfle_fac_list[i] & + kvm_s390_fac_base[i]; + } + kvm->arch.model.subfuncs = kvm_s390_available_subfunc; + + /* we are always in czam mode - even on pre z14 machines */ + set_kvm_facility(kvm->arch.model.fac_mask, 138); + set_kvm_facility(kvm->arch.model.fac_list, 138); + /* we emulate STHYI in kvm */ + set_kvm_facility(kvm->arch.model.fac_mask, 74); + set_kvm_facility(kvm->arch.model.fac_list, 74); + if (MACHINE_HAS_TLB_GUEST) { + set_kvm_facility(kvm->arch.model.fac_mask, 147); + set_kvm_facility(kvm->arch.model.fac_list, 147); + } + + if (css_general_characteristics.aiv && test_facility(65)) + set_kvm_facility(kvm->arch.model.fac_mask, 65); + + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); + kvm->arch.model.ibc = sclp.ibc & 0x0fff; + + kvm->arch.model.uv_feat_guest.feat = 0; + + kvm_s390_crypto_init(kvm); + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + + mutex_init(&kvm->arch.float_int.ais_lock); + spin_lock_init(&kvm->arch.float_int.lock); + for (i = 0; i < FIRQ_LIST_COUNT; i++) + INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]); + init_waitqueue_head(&kvm->arch.ipte_wq); + mutex_init(&kvm->arch.ipte_mutex); + + debug_register_view(kvm->arch.dbf, &debug_sprintf_view); + VM_EVENT(kvm, 3, "vm created with type %lu", type); + + if (type & KVM_VM_S390_UCONTROL) { + kvm->arch.gmap = NULL; + kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; + } else { + if (sclp.hamax == U64_MAX) + kvm->arch.mem_limit = TASK_SIZE_MAX; + else + kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, + sclp.hamax + 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); + if (!kvm->arch.gmap) + goto out_err; + kvm->arch.gmap->private = kvm; + kvm->arch.gmap->pfault_enabled = 0; + } + + kvm->arch.use_pfmfi = sclp.has_pfmfi; + kvm->arch.use_skf = sclp.has_skey; + spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); + + return 0; +out_err: + free_page((unsigned long)kvm->arch.sie_page2); + debug_unregister(kvm->arch.dbf); + sca_dispose(kvm); + KVM_EVENT(3, "creation of vm failed: %d", rc); + return rc; +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + u16 rc, rrc; + + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); + trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); + kvm_s390_clear_local_irqs(vcpu); + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_is_ucontrol(vcpu->kvm)) + sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); + + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); + + if (vcpu->kvm->arch.use_cmma) + kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if (kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); + free_page((unsigned long)(vcpu->arch.sie_block)); +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + u16 rc, rrc; + + kvm_destroy_vcpus(kvm); + sca_dispose(kvm); + kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. + */ + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + + debug_unregister(kvm->arch.dbf); + free_page((unsigned long)kvm->arch.sie_page2); + if (!kvm_is_ucontrol(kvm)) + gmap_remove(kvm->arch.gmap); + kvm_s390_destroy_adapters(kvm); + kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); + KVM_EVENT(3, "vm 0x%pK destroyed", kvm); +} + +/* Section: vcpu related */ +static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) +{ + vcpu->arch.gmap = gmap_create(current->mm, -1UL); + if (!vcpu->arch.gmap) + return -ENOMEM; + vcpu->arch.gmap->private = vcpu->kvm; + + return 0; +} + +static void sca_del_vcpu(struct kvm_vcpu *vcpu) +{ + if (!kvm_s390_use_sca_entries()) + return; + read_lock(&vcpu->kvm->arch.sca_lock); + if (vcpu->kvm->arch.use_esca) { + struct esca_block *sca = vcpu->kvm->arch.sca; + + clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; + } else { + struct bsca_block *sca = vcpu->kvm->arch.sca; + + clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; + } + read_unlock(&vcpu->kvm->arch.sca_lock); +} + +static void sca_add_vcpu(struct kvm_vcpu *vcpu) +{ + if (!kvm_s390_use_sca_entries()) { + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); + + /* we still need the basic sca for the ipte control */ + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; + return; + } + read_lock(&vcpu->kvm->arch.sca_lock); + if (vcpu->kvm->arch.use_esca) { + struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); + + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); + } else { + struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); + + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; + set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); + } + read_unlock(&vcpu->kvm->arch.sca_lock); +} + +/* Basic SCA to Extended SCA data copy routines */ +static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s) +{ + d->sda = s->sda; + d->sigp_ctrl.c = s->sigp_ctrl.c; + d->sigp_ctrl.scn = s->sigp_ctrl.scn; +} + +static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s) +{ + int i; + + d->ipte_control = s->ipte_control; + d->mcn[0] = s->mcn; + for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++) + sca_copy_entry(&d->cpu[i], &s->cpu[i]); +} + +static int sca_switch_to_extended(struct kvm *kvm) +{ + struct bsca_block *old_sca = kvm->arch.sca; + struct esca_block *new_sca; + struct kvm_vcpu *vcpu; + unsigned long vcpu_idx; + u32 scaol, scaoh; + phys_addr_t new_sca_phys; + + if (kvm->arch.use_esca) + return 0; + + new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!new_sca) + return -ENOMEM; + + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; + + kvm_s390_vcpu_block_all(kvm); + write_lock(&kvm->arch.sca_lock); + + sca_copy_b_to_e(new_sca, old_sca); + + kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { + vcpu->arch.sie_block->scaoh = scaoh; + vcpu->arch.sie_block->scaol = scaol; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + } + kvm->arch.sca = new_sca; + kvm->arch.use_esca = 1; + + write_unlock(&kvm->arch.sca_lock); + kvm_s390_vcpu_unblock_all(kvm); + + free_page((unsigned long)old_sca); + + VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)", + old_sca, kvm->arch.sca); + return 0; +} + +static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) +{ + int rc; + + if (!kvm_s390_use_sca_entries()) { + if (id < KVM_MAX_VCPUS) + return true; + return false; + } + if (id < KVM_S390_BSCA_CPU_SLOTS) + return true; + if (!sclp.has_esca || !sclp.has_64bscao) + return false; + + rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); + + return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; +} + +/* needs disabled preemption to protect from TOD sync and vcpu_load/put */ +static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(vcpu->arch.cputm_start != 0); + raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount); + vcpu->arch.cputm_start = get_tod_clock_fast(); + raw_write_seqcount_end(&vcpu->arch.cputm_seqcount); +} + +/* needs disabled preemption to protect from TOD sync and vcpu_load/put */ +static void __stop_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(vcpu->arch.cputm_start == 0); + raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount); + vcpu->arch.sie_block->cputm -= get_tod_clock_fast() - vcpu->arch.cputm_start; + vcpu->arch.cputm_start = 0; + raw_write_seqcount_end(&vcpu->arch.cputm_seqcount); +} + +/* needs disabled preemption to protect from TOD sync and vcpu_load/put */ +static void __enable_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(vcpu->arch.cputm_enabled); + vcpu->arch.cputm_enabled = true; + __start_cpu_timer_accounting(vcpu); +} + +/* needs disabled preemption to protect from TOD sync and vcpu_load/put */ +static void __disable_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(!vcpu->arch.cputm_enabled); + __stop_cpu_timer_accounting(vcpu); + vcpu->arch.cputm_enabled = false; +} + +static void enable_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + preempt_disable(); /* protect from TOD sync and vcpu_load/put */ + __enable_cpu_timer_accounting(vcpu); + preempt_enable(); +} + +static void disable_cpu_timer_accounting(struct kvm_vcpu *vcpu) +{ + preempt_disable(); /* protect from TOD sync and vcpu_load/put */ + __disable_cpu_timer_accounting(vcpu); + preempt_enable(); +} + +/* set the cpu timer - may only be called from the VCPU thread itself */ +void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm) +{ + preempt_disable(); /* protect from TOD sync and vcpu_load/put */ + raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount); + if (vcpu->arch.cputm_enabled) + vcpu->arch.cputm_start = get_tod_clock_fast(); + vcpu->arch.sie_block->cputm = cputm; + raw_write_seqcount_end(&vcpu->arch.cputm_seqcount); + preempt_enable(); +} + +/* update and get the cpu timer - can also be called from other VCPU threads */ +__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu) +{ + unsigned int seq; + __u64 value; + + if (unlikely(!vcpu->arch.cputm_enabled)) + return vcpu->arch.sie_block->cputm; + + preempt_disable(); /* protect from TOD sync and vcpu_load/put */ + do { + seq = raw_read_seqcount(&vcpu->arch.cputm_seqcount); + /* + * If the writer would ever execute a read in the critical + * section, e.g. in irq context, we have a deadlock. + */ + WARN_ON_ONCE((seq & 1) && smp_processor_id() == vcpu->cpu); + value = vcpu->arch.sie_block->cputm; + /* if cputm_start is 0, accounting is being started/stopped */ + if (likely(vcpu->arch.cputm_start)) + value -= get_tod_clock_fast() - vcpu->arch.cputm_start; + } while (read_seqcount_retry(&vcpu->arch.cputm_seqcount, seq & ~1)); + preempt_enable(); + return value; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + + gmap_enable(vcpu->arch.enabled_gmap); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING); + if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) + __start_cpu_timer_accounting(vcpu); + vcpu->cpu = cpu; +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->cpu = -1; + if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) + __stop_cpu_timer_accounting(vcpu); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); + +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + mutex_lock(&vcpu->kvm->lock); + preempt_disable(); + vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch; + vcpu->arch.sie_block->epdx = vcpu->kvm->arch.epdx; + preempt_enable(); + mutex_unlock(&vcpu->kvm->lock); + if (!kvm_is_ucontrol(vcpu->kvm)) { + vcpu->arch.gmap = vcpu->kvm->arch.gmap; + sca_add_vcpu(vcpu); + } + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; +} + +static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr) +{ + if (test_bit_inv(nr, (unsigned long *)&kvm->arch.model.subfuncs.pckmo) && + test_bit_inv(nr, (unsigned long *)&kvm_s390_available_subfunc.pckmo)) + return true; + return false; +} + +static bool kvm_has_pckmo_ecc(struct kvm *kvm) +{ + /* At least one ECC subfunction must be present */ + return kvm_has_pckmo_subfunc(kvm, 32) || + kvm_has_pckmo_subfunc(kvm, 33) || + kvm_has_pckmo_subfunc(kvm, 34) || + kvm_has_pckmo_subfunc(kvm, 40) || + kvm_has_pckmo_subfunc(kvm, 41); + +} + +static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) +{ + /* + * If the AP instructions are not being interpreted and the MSAX3 + * facility is not configured for the guest, there is nothing to set up. + */ + if (!vcpu->kvm->arch.crypto.apie && !test_kvm_facility(vcpu->kvm, 76)) + return; + + vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; + vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); + vcpu->arch.sie_block->eca &= ~ECA_APIE; + vcpu->arch.sie_block->ecd &= ~ECD_ECC; + + if (vcpu->kvm->arch.crypto.apie) + vcpu->arch.sie_block->eca |= ECA_APIE; + + /* Set up protected key support */ + if (vcpu->kvm->arch.crypto.aes_kw) { + vcpu->arch.sie_block->ecb3 |= ECB3_AES; + /* ecc is also wrapped with AES key */ + if (kvm_has_pckmo_ecc(vcpu->kvm)) + vcpu->arch.sie_block->ecd |= ECD_ECC; + } + + if (vcpu->kvm->arch.crypto.dea_kw) + vcpu->arch.sie_block->ecb3 |= ECB3_DEA; +} + +void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) +{ + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); + vcpu->arch.sie_block->cbrlo = 0; +} + +int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) +{ + void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) + return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); + return 0; +} + +static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model; + + vcpu->arch.sie_block->ibc = model->ibc; + if (test_kvm_facility(vcpu->kvm, 7)) + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); +} + +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) +{ + int rc = 0; + u16 uvrc, uvrrc; + + atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | + CPUSTAT_SM | + CPUSTAT_STOPPED); + + if (test_kvm_facility(vcpu->kvm, 78)) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED2); + else if (test_kvm_facility(vcpu->kvm, 8)) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED); + + kvm_s390_vcpu_setup_model(vcpu); + + /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ + if (MACHINE_HAS_ESOP) + vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; + if (test_kvm_facility(vcpu->kvm, 9)) + vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; + if (test_kvm_facility(vcpu->kvm, 73)) + vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; + + if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) + vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; + if (test_kvm_facility(vcpu->kvm, 130)) + vcpu->arch.sie_block->ecb2 |= ECB2_IEP; + vcpu->arch.sie_block->eca = ECA_MVPGI | ECA_PROTEXCI; + if (sclp.has_cei) + vcpu->arch.sie_block->eca |= ECA_CEI; + if (sclp.has_ib) + vcpu->arch.sie_block->eca |= ECA_IB; + if (sclp.has_siif) + vcpu->arch.sie_block->eca |= ECA_SII; + if (sclp.has_sigpif) + vcpu->arch.sie_block->eca |= ECA_SIGPI; + if (test_kvm_facility(vcpu->kvm, 129)) { + vcpu->arch.sie_block->eca |= ECA_VX; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + } + if (test_kvm_facility(vcpu->kvm, 139)) + vcpu->arch.sie_block->ecd |= ECD_MEF; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->arch.sie_block->ecd |= ECD_ETOKENF; + if (vcpu->arch.sie_block->gd) { + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + } + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); + + if (sclp.has_kss) + kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); + else + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + + if (vcpu->kvm->arch.use_cmma) { + rc = kvm_s390_vcpu_setup_cmma(vcpu); + if (rc) + return rc; + } + hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + + vcpu->arch.sie_block->hpid = HPID_KVM; + + kvm_s390_vcpu_crypto_setup(vcpu); + + kvm_s390_vcpu_pci_setup(vcpu); + + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + + return rc; +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) + return -EINVAL; + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct sie_page *sie_page; + int rc; + + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!sie_page) + return -ENOMEM; + + vcpu->arch.sie_block = &sie_page->sie_block; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); + + /* the real guest size will always be smaller than msl */ + vcpu->arch.sie_block->mso = 0; + vcpu->arch.sie_block->msl = sclp.hamax; + + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; + spin_lock_init(&vcpu->arch.local_int.lock); + vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm); + seqcount_init(&vcpu->arch.cputm_seqcount); + + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. + */ + if (MACHINE_HAS_VX) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_free_sie_block; + } + + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + rc = kvm_s390_vcpu_setup(vcpu); + if (rc) + goto out_ucontrol_uninit; + + kvm_s390_update_topology_change_report(vcpu->kvm, 1); + return 0; + +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); +out_free_sie_block: + free_page((unsigned long)(vcpu->arch.sie_block)); + return rc; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); + return kvm_s390_vcpu_has_irq(vcpu, 0); +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE); +} + +void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu) +{ + atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); + exit_sie(vcpu); +} + +void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu) +{ + atomic_andnot(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); +} + +static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu) +{ + atomic_or(PROG_REQUEST, &vcpu->arch.sie_block->prog20); + exit_sie(vcpu); +} + +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu) +{ + return atomic_read(&vcpu->arch.sie_block->prog20) & + (PROG_BLOCK_SIE | PROG_REQUEST); +} + +static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) +{ + atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20); +} + +/* + * Kick a guest cpu out of (v)SIE and wait until (v)SIE is not running. + * If the CPU is not running (e.g. waiting as idle) the function will + * return immediately. */ +void exit_sie(struct kvm_vcpu *vcpu) +{ + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); + kvm_s390_vsie_kick(vcpu); + while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* Kick a guest cpu out of SIE to process a request synchronously */ +void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) +{ + __kvm_make_request(req, vcpu); + kvm_s390_vcpu_request(vcpu); +} + +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct kvm_vcpu *vcpu; + unsigned long prefix; + unsigned long i; + + if (gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + /* match against both prefix pages */ + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); + } + } +} + +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + /* do not poll with more than halt_poll_max_steal percent of steal time */ + if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= + READ_ONCE(halt_poll_max_steal)) { + vcpu->stat.halt_no_poll_steal++; + return true; + } + return false; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + /* kvm common code refers to this, but never calls it */ + BUG(); + return 0; +} + +static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_S390_TODPR: + r = put_user(vcpu->arch.sie_block->todpr, + (u32 __user *)reg->addr); + break; + case KVM_REG_S390_EPOCHDIFF: + r = put_user(vcpu->arch.sie_block->epoch, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CPU_TIMER: + r = put_user(kvm_s390_get_cpu_timer(vcpu), + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CLOCK_COMP: + r = put_user(vcpu->arch.sie_block->ckc, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFTOKEN: + r = put_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFCOMPARE: + r = put_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = put_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PP: + r = put_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_GBEA: + r = put_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + __u64 val; + + switch (reg->id) { + case KVM_REG_S390_TODPR: + r = get_user(vcpu->arch.sie_block->todpr, + (u32 __user *)reg->addr); + break; + case KVM_REG_S390_EPOCHDIFF: + r = get_user(vcpu->arch.sie_block->epoch, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CPU_TIMER: + r = get_user(val, (u64 __user *)reg->addr); + if (!r) + kvm_s390_set_cpu_timer(vcpu, val); + break; + case KVM_REG_S390_CLOCK_COMP: + r = get_user(vcpu->arch.sie_block->ckc, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFTOKEN: + r = get_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + kvm_clear_async_pf_completion_queue(vcpu); + break; + case KVM_REG_S390_PFCOMPARE: + r = get_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = get_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PP: + r = get_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_GBEA: + r = get_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu) +{ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb)); + + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + kvm_s390_clear_local_irqs(vcpu); +} + +static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +{ + /* Initial reset is a superset of the normal reset */ + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + + /* + * This equals initial cpu reset in pop, but we don't switch to ESA. + * We do not only reset the internal data, but also ... + */ + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; + kvm_s390_set_prefix(vcpu, 0); + kvm_s390_set_cpu_timer(vcpu, 0); + vcpu->arch.sie_block->ckc = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + + /* ... the data in sync regs */ + memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs)); + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK; + vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK; + vcpu->run->psw_addr = 0; + vcpu->run->psw_mask = 0; + vcpu->run->s.regs.todpr = 0; + vcpu->run->s.regs.cputm = 0; + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.pp = 0; + vcpu->run->s.regs.gbea = 1; + vcpu->run->s.regs.fpc = 0; + /* + * Do not reset these registers in the protected case, as some of + * them are overlaid and they are not accessible in this case + * anyway. + */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } +} + +static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Clear reset is a superset of the initial reset */ + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + + memset(®s->gprs, 0, sizeof(regs->gprs)); + memset(®s->vrs, 0, sizeof(regs->vrs)); + memset(®s->acrs, 0, sizeof(regs->acrs)); + memset(®s->gscb, 0, sizeof(regs->gscb)); + + regs->etoken = 0; + regs->etoken_extension = 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + memcpy(&vcpu->run->s.regs.gprs, ®s->gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); + return 0; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + memcpy(®s->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + vcpu_load(vcpu); + + memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); + memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); + + vcpu_put(vcpu); + return 0; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + vcpu_load(vcpu); + + memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs)); + memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); + + vcpu_put(vcpu); + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + int ret = 0; + + vcpu_load(vcpu); + + if (test_fp_ctl(fpu->fpc)) { + ret = -EINVAL; + goto out; + } + vcpu->run->s.regs.fpc = fpu->fpc; + if (MACHINE_HAS_VX) + convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, + (freg_t *) fpu->fprs); + else + memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); + +out: + vcpu_put(vcpu); + return ret; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_load(vcpu); + + /* make sure we have the latest values */ + save_fpu_regs(); + if (MACHINE_HAS_VX) + convert_vx_to_fp((freg_t *) fpu->fprs, + (__vector128 *) vcpu->run->s.regs.vrs); + else + memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); + fpu->fpc = vcpu->run->s.regs.fpc; + + vcpu_put(vcpu); + return 0; +} + +static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw) +{ + int rc = 0; + + if (!is_vcpu_stopped(vcpu)) + rc = -EBUSY; + else { + vcpu->run->psw_mask = psw.mask; + vcpu->run->psw_addr = psw.addr; + } + return rc; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return -EINVAL; /* not implemented yet */ +} + +#define VALID_GUESTDBG_FLAGS (KVM_GUESTDBG_SINGLESTEP | \ + KVM_GUESTDBG_USE_HW_BP | \ + KVM_GUESTDBG_ENABLE) + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + int rc = 0; + + vcpu_load(vcpu); + + vcpu->guest_debug = 0; + kvm_s390_clear_bp_data(vcpu); + + if (dbg->control & ~VALID_GUESTDBG_FLAGS) { + rc = -EINVAL; + goto out; + } + if (!sclp.has_gpere) { + rc = -EINVAL; + goto out; + } + + if (dbg->control & KVM_GUESTDBG_ENABLE) { + vcpu->guest_debug = dbg->control; + /* enforce guest PER */ + kvm_s390_set_cpuflags(vcpu, CPUSTAT_P); + + if (dbg->control & KVM_GUESTDBG_USE_HW_BP) + rc = kvm_s390_import_bp_data(vcpu, dbg); + } else { + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P); + vcpu->arch.guestdbg.last_bp = 0; + } + + if (rc) { + vcpu->guest_debug = 0; + kvm_s390_clear_bp_data(vcpu); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P); + } + +out: + vcpu_put(vcpu); + return rc; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret; + + vcpu_load(vcpu); + + /* CHECK_STOP and LOAD are not supported yet */ + ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED : + KVM_MP_STATE_OPERATING; + + vcpu_put(vcpu); + return ret; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int rc = 0; + + vcpu_load(vcpu); + + /* user space knows about this interface - let it control the state */ + kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm); + + switch (mp_state->mp_state) { + case KVM_MP_STATE_STOPPED: + rc = kvm_s390_vcpu_stop(vcpu); + break; + case KVM_MP_STATE_OPERATING: + rc = kvm_s390_vcpu_start(vcpu); + break; + case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; + case KVM_MP_STATE_CHECK_STOP: + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ + default: + rc = -ENXIO; + } + + vcpu_put(vcpu); + return rc; +} + +static bool ibs_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); +} + +static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) +{ +retry: + kvm_s390_vcpu_request_handled(vcpu); + if (!kvm_request_pending(vcpu)) + return 0; + /* + * If the guest prefix changed, re-arm the ipte notifier for the + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. + * This ensures that the ipte instruction for this request has + * already finished. We might race against a second unmapper that + * wants to set the blocking bit. Lets just retry the request loop. + */ + if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { + int rc; + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); + if (rc) { + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); + return rc; + } + goto retry; + } + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + vcpu->arch.sie_block->ihcpu = 0xffff; + goto retry; + } + + if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) { + if (!ibs_enabled(vcpu)) { + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1); + kvm_s390_set_cpuflags(vcpu, CPUSTAT_IBS); + } + goto retry; + } + + if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) { + if (ibs_enabled(vcpu)) { + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0); + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IBS); + } + goto retry; + } + + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + + if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { + /* + * Disable CMM virtualization; we will emulate the ESSA + * instruction manually, in order to provide additional + * functionalities needed for live migration. + */ + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; + goto retry; + } + + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { + /* + * Re-enable CMM virtualization if CMMA is available and + * CMM has been used. + */ + if ((vcpu->kvm->arch.use_cmma) && + (vcpu->kvm->mm->context.uses_cmm)) + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + goto retry; + } + + /* we left the vsie handler, nothing to do, just clear the request */ + kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); + + return 0; +} + +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) +{ + struct kvm_vcpu *vcpu; + union tod_clock clk; + unsigned long i; + + preempt_disable(); + + store_tod_clock_ext(&clk); + + kvm->arch.epoch = gtod->tod - clk.tod; + kvm->arch.epdx = 0; + if (test_kvm_facility(kvm, 139)) { + kvm->arch.epdx = gtod->epoch_idx - clk.ei; + if (kvm->arch.epoch > gtod->tod) + kvm->arch.epdx -= 1; + } + + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.sie_block->epoch = kvm->arch.epoch; + vcpu->arch.sie_block->epdx = kvm->arch.epdx; + } + + kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); +} + +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) +{ + if (!mutex_trylock(&kvm->lock)) + return 0; + __kvm_s390_set_tod_clock(kvm, gtod); + mutex_unlock(&kvm->lock); + return 1; +} + +/** + * kvm_arch_fault_in_page - fault-in guest page if necessary + * @vcpu: The corresponding virtual cpu + * @gpa: Guest physical address + * @writable: Whether the page should be writable or not + * + * Make sure that a guest page has been faulted-in on the host. + * + * Return: Zero on success, negative error code otherwise. + */ +long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable) +{ + return gmap_fault(vcpu->arch.gmap, gpa, + writable ? FAULT_FLAG_WRITE : 0); +} + +static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, + unsigned long token) +{ + struct kvm_s390_interrupt inti; + struct kvm_s390_irq irq; + + if (start_token) { + irq.u.ext.ext_params2 = token; + irq.type = KVM_S390_INT_PFAULT_INIT; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); + } else { + inti.type = KVM_S390_INT_PFAULT_DONE; + inti.parm64 = token; + WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti)); + } +} + +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); + + return true; +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token); +} + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + /* s390 will always inject the page directly */ +} + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) +{ + /* + * s390 will always inject the page directly, + * but we still want check_async_completion to cleanup + */ + return true; +} + +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +{ + hva_t hva; + struct kvm_arch_async_pf arch; + + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + return false; + if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != + vcpu->arch.pfault_compare) + return false; + if (psw_extint_disabled(vcpu)) + return false; + if (kvm_s390_vcpu_has_irq(vcpu, 0)) + return false; + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + return false; + if (!vcpu->arch.gmap->pfault_enabled) + return false; + + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); + hva += current->thread.gmap_addr & ~PAGE_MASK; + if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) + return false; + + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); +} + +static int vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + int rc, cpuflags; + + /* + * On s390 notifications for arriving pages will be delivered directly + * to the guest but the house keeping for completed pfaults is + * handled outside the worker. + */ + kvm_check_async_pf_completion(vcpu); + + vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14]; + vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15]; + + if (need_resched()) + schedule(); + + if (!kvm_is_ucontrol(vcpu->kvm)) { + rc = kvm_s390_deliver_pending_interrupts(vcpu); + if (rc || guestdbg_exit_pending(vcpu)) + return rc; + } + + rc = kvm_s390_handle_requests(vcpu); + if (rc) + return rc; + + if (guestdbg_enabled(vcpu)) { + kvm_s390_backup_guest_per_regs(vcpu); + kvm_s390_patch_guest_per_regs(vcpu); + } + + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); + + vcpu->arch.sie_block->icptcode = 0; + cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); + VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); + trace_kvm_s390_sie_enter(vcpu, cpuflags); + + return 0; +} + +static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_ADDRESSING, + }; + u8 opcode, ilen; + int rc; + + VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); + trace_kvm_s390_sie_fault(vcpu); + + /* + * We want to inject an addressing exception, which is defined as a + * suppressing or terminating exception. However, since we came here + * by a DAT access exception, the PSW still points to the faulting + * instruction since DAT exceptions are nullifying. So we've got + * to look up the current opcode to get the length of the instruction + * to be able to forward the PSW. + */ + rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1); + ilen = insn_length(opcode); + if (rc < 0) { + return rc; + } else if (rc) { + /* Instruction-Fetching Exceptions - we can't detect the ilen. + * Forward by arbitrary ilc, injection will take care of + * nullification if necessary. + */ + pgm_info = vcpu->arch.pgm; + ilen = 4; + } + pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID; + kvm_s390_forward_psw(vcpu, ilen); + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + +static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) +{ + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; + + VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", + vcpu->arch.sie_block->icptcode); + trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); + + if (guestdbg_enabled(vcpu)) + kvm_s390_restore_guest_per_regs(vcpu); + + vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; + vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; + + if (exit_reason == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(vcpu->arch.sie_block, + struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + + if (vcpu->arch.sie_block->icptcode > 0) { + int rc = kvm_handle_sie_intercept(vcpu); + + if (rc != -EOPNOTSUPP) + return rc; + vcpu->run->exit_reason = KVM_EXIT_S390_SIEIC; + vcpu->run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode; + vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; + vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; + return -EREMOTE; + } else if (exit_reason != -EFAULT) { + vcpu->stat.exit_null++; + return 0; + } else if (kvm_is_ucontrol(vcpu->kvm)) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = + current->thread.gmap_addr; + vcpu->run->s390_ucontrol.pgm_code = 0x10; + return -EREMOTE; + } else if (current->thread.gmap_pfault) { + trace_kvm_s390_major_guest_pfault(vcpu); + current->thread.gmap_pfault = 0; + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1); + } + return vcpu_post_run_fault_in_sie(vcpu); +} + +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; + + /* + * We try to hold kvm->srcu during most of vcpu_run (except when run- + * ning the guest), so that memslots (and other stuff) are protected + */ + kvm_vcpu_srcu_read_lock(vcpu); + + do { + rc = vcpu_pre_run(vcpu); + if (rc || guestdbg_exit_pending(vcpu)) + break; + + kvm_vcpu_srcu_read_unlock(vcpu); + /* + * As PF_VCPU will be used in fault handler, between + * guest_enter and guest_exit should be no uaccess. + */ + local_irq_disable(); + guest_enter_irqoff(); + __disable_cpu_timer_accounting(vcpu); + local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); + exit_reason = sie64a(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. + */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } + local_irq_disable(); + __enable_cpu_timer_accounting(vcpu); + guest_exit_irqoff(); + local_irq_enable(); + kvm_vcpu_srcu_read_lock(vcpu); + + rc = vcpu_post_run(vcpu, exit_reason); + } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); + + kvm_vcpu_srcu_read_unlock(vcpu); + return rc; +} + +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + struct runtime_instr_cb *riccb; + struct gs_cb *gscb; + + riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; + vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; + vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; + vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; + vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PFAULT) { + vcpu->arch.pfault_token = kvm_run->s.regs.pft; + vcpu->arch.pfault_select = kvm_run->s.regs.pfs; + vcpu->arch.pfault_compare = kvm_run->s.regs.pfc; + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + kvm_clear_async_pf_completion_queue(vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc); + } + /* + * If userspace sets the riccb (e.g. after migration) to a valid state, + * we should enable RI here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && + test_kvm_facility(vcpu->kvm, 64) && + riccb->v && + !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); + vcpu->arch.sie_block->ecb3 |= ECB3_RI; + } + /* + * If userspace sets the gscb (e.g. after migration) to non-zero, + * we should enable GS here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_GSCB) && + test_kvm_facility(vcpu->kvm, 133) && + gscb->gssm && + !vcpu->arch.gs_enabled) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (sync_regs)"); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; + } + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) && + test_kvm_facility(vcpu->kvm, 82)) { + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; + } + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (current->thread.gs_cb) { + vcpu->arch.host_gscb = current->thread.gs_cb; + save_gs_cb(vcpu->arch.host_gscb); + } + if (vcpu->arch.gs_enabled) { + current->thread.gs_cb = (struct gs_cb *) + &vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + } + preempt_enable(); + } + /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (MACHINE_HAS_VX) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + if (test_fp_ctl(current->thread.fpu.fpc)) + /* User space provided an invalid FPC, let's clear it */ + current->thread.fpu.fpc = 0; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. + */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } + + kvm_run->kvm_dirty_regs = 0; +} + +static void store_regs_fmt2(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + if (!vcpu->arch.host_gscb) + __ctl_clear_bit(2, 4); + vcpu->arch.host_gscb = NULL; + preempt_enable(); + } + /* SIE will save etoken directly into SDNX and therefore kvm_run */ +} + +static void store_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; + kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); + memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); + kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); + kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; + kvm_run->s.regs.pft = vcpu->arch.pfault_token; + kvm_run->s.regs.pfs = vcpu->arch.pfault_select; + kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; + save_access_regs(vcpu->run->s.regs.acrs); + restore_access_regs(vcpu->arch.host_acrs); + /* Save guest register state */ + save_fpu_regs(); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; + /* Restore will be done lazily at return */ + current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; + current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu); +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + int rc; + + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. + */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (kvm_run->immediate_exit) + return -EINTR; + + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS || + kvm_run->kvm_dirty_regs & ~KVM_SYNC_S390_VALID_FIELDS) + return -EINVAL; + + vcpu_load(vcpu); + + if (guestdbg_exit_pending(vcpu)) { + kvm_s390_prepare_debug_exit(vcpu); + rc = 0; + goto out; + } + + kvm_sigset_activate(vcpu); + + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { + kvm_s390_vcpu_start(vcpu); + } else if (is_vcpu_stopped(vcpu)) { + pr_err_ratelimited("can't run stopped vcpu %d\n", + vcpu->vcpu_id); + rc = -EINVAL; + goto out; + } + + sync_regs(vcpu); + enable_cpu_timer_accounting(vcpu); + + might_fault(); + rc = __vcpu_run(vcpu); + + if (signal_pending(current) && !rc) { + kvm_run->exit_reason = KVM_EXIT_INTR; + rc = -EINTR; + } + + if (guestdbg_exit_pending(vcpu) && !rc) { + kvm_s390_prepare_debug_exit(vcpu); + rc = 0; + } + + if (rc == -EREMOTE) { + /* userspace support is needed, kvm_run has been prepared */ + rc = 0; + } + + disable_cpu_timer_accounting(vcpu); + store_regs(vcpu); + + kvm_sigset_deactivate(vcpu); + + vcpu->stat.exit_userspace++; +out: + vcpu_put(vcpu); + return rc; +} + +/* + * store status at address + * we use have two special cases: + * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit + * KVM_S390_STORE_STATUS_PREFIXED: -> prefix + */ +int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) +{ + unsigned char archmode = 1; + freg_t fprs[NUM_FPRS]; + unsigned int px; + u64 clkcomp, cputm; + int rc; + + px = kvm_s390_get_prefix(vcpu); + if (gpa == KVM_S390_STORE_STATUS_NOADDR) { + if (write_guest_abs(vcpu, 163, &archmode, 1)) + return -EFAULT; + gpa = 0; + } else if (gpa == KVM_S390_STORE_STATUS_PREFIXED) { + if (write_guest_real(vcpu, 163, &archmode, 1)) + return -EFAULT; + gpa = px; + } else + gpa -= __LC_FPREGS_SAVE_AREA; + + /* manually convert vector registers if necessary */ + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); + rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, + fprs, 128); + } else { + rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.fprs, 128); + } + rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA, + vcpu->run->s.regs.gprs, 128); + rc |= write_guest_abs(vcpu, gpa + __LC_PSW_SAVE_AREA, + &vcpu->arch.sie_block->gpsw, 16); + rc |= write_guest_abs(vcpu, gpa + __LC_PREFIX_SAVE_AREA, + &px, 4); + rc |= write_guest_abs(vcpu, gpa + __LC_FP_CREG_SAVE_AREA, + &vcpu->run->s.regs.fpc, 4); + rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA, + &vcpu->arch.sie_block->todpr, 4); + cputm = kvm_s390_get_cpu_timer(vcpu); + rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA, + &cputm, 8); + clkcomp = vcpu->arch.sie_block->ckc >> 8; + rc |= write_guest_abs(vcpu, gpa + __LC_CLOCK_COMP_SAVE_AREA, + &clkcomp, 8); + rc |= write_guest_abs(vcpu, gpa + __LC_AREGS_SAVE_AREA, + &vcpu->run->s.regs.acrs, 64); + rc |= write_guest_abs(vcpu, gpa + __LC_CREGS_SAVE_AREA, + &vcpu->arch.sie_block->gcr, 128); + return rc ? -EFAULT : 0; +} + +int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) +{ + /* + * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy + * switch in the run ioctl. Let's update our copies before we save + * it into the save area + */ + save_fpu_regs(); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; + save_access_regs(vcpu->run->s.regs.acrs); + + return kvm_s390_store_status_unloaded(vcpu, addr); +} + +static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) +{ + kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); + kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu); +} + +static void __disable_ibs_on_all_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + __disable_ibs_on_vcpu(vcpu); + } +} + +static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) +{ + if (!sclp.has_ibs) + return; + kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); + kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); +} + +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +{ + int i, online_vcpus, r = 0, started_vcpus = 0; + + if (!is_vcpu_stopped(vcpu)) + return 0; + + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); + /* Only one cpu at a time may enter/leave the STOPPED state. */ + spin_lock(&vcpu->kvm->arch.start_stop_lock); + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + + for (i = 0; i < online_vcpus; i++) { + if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i))) + started_vcpus++; + } + + if (started_vcpus == 0) { + /* we're the only active VCPU -> speed it up */ + __enable_ibs_on_vcpu(vcpu); + } else if (started_vcpus == 1) { + /* + * As we are starting a second VCPU, we have to disable + * the IBS facility on all VCPUs to remove potentially + * outstanding ENABLE requests. + */ + __disable_ibs_on_all_vcpus(vcpu->kvm); + } + + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); + /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* + * Another VCPU might have used IBS while we were offline. + * Let's play safe and flush the VCPU at startup. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return 0; +} + +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +{ + int i, online_vcpus, r = 0, started_vcpus = 0; + struct kvm_vcpu *started_vcpu = NULL; + + if (is_vcpu_stopped(vcpu)) + return 0; + + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); + /* Only one cpu at a time may enter/leave the STOPPED state. */ + spin_lock(&vcpu->kvm->arch.start_stop_lock); + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + + /* + * Set the VCPU to STOPPED and THEN clear the interrupt flag, + * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders + * have been fully processed. This will ensure that the VCPU + * is kept BUSY if another VCPU is inquiring with SIGP SENSE. + */ + kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED); + kvm_s390_clear_stop_irq(vcpu); + + __disable_ibs_on_vcpu(vcpu); + + for (i = 0; i < online_vcpus; i++) { + struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i); + + if (!is_vcpu_stopped(tmp)) { + started_vcpus++; + started_vcpu = tmp; + } + } + + if (started_vcpus == 1) { + /* + * As we only have one VCPU left, we want to enable the + * IBS facility for that VCPU to speed it up. + */ + __enable_ibs_on_vcpu(started_vcpu); + } + + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return 0; +} + +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_S390_CSS_SUPPORT: + if (!vcpu->kvm->arch.css_support) { + vcpu->kvm->arch.css_support = 1; + VM_EVENT(vcpu->kvm, 3, "%s", "ENABLE: CSS support"); + trace_kvm_s390_enable_css(vcpu->kvm); + } + r = 0; + break; + default: + r = -EINVAL; + break; + } + return r; +} + +static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void *sida_addr; + int r = 0; + + if (mop->flags || !mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) + return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, sida_addr, mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user(sida_addr, uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} + +static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) + return -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; + } + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_free; + } + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + } + +out_inject: + if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + +out_free: + vfree(tmpbuf); + return r; +} + +static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_S390_MEMOP_LOGICAL_READ: + case KVM_S390_MEMOP_LOGICAL_WRITE: + r = kvm_s390_vcpu_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_vcpu_sida_op(vcpu, mop); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + return r; +} + +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int rc; + + switch (ioctl) { + case KVM_S390_IRQ: { + struct kvm_s390_irq s390irq; + + if (copy_from_user(&s390irq, argp, sizeof(s390irq))) + return -EFAULT; + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; + } + case KVM_S390_INTERRUPT: { + struct kvm_s390_interrupt s390int; + struct kvm_s390_irq s390irq = {}; + + if (copy_from_user(&s390int, argp, sizeof(s390int))) + return -EFAULT; + if (s390int_to_s390irq(&s390int, &s390irq)) + return -EINVAL; + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; + } + default: + rc = -ENOIOCTLCMD; + break; + } + + /* + * To simplify single stepping of userspace-emulated instructions, + * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see + * should_handle_per_ifetch()). However, if userspace emulation injects + * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens + * after (and not before) the interrupt delivery. + */ + if (!rc) + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; + + return rc; +} + +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. */ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int idx; + long r; + u16 rc, rrc; + + vcpu_load(vcpu); + + switch (ioctl) { + case KVM_S390_STORE_STATUS: + idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvm_s390_store_status_unloaded(vcpu, arg); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; + case KVM_S390_SET_INITIAL_PSW: { + psw_t psw; + + r = -EFAULT; + if (copy_from_user(&psw, argp, sizeof(psw))) + break; + r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); + break; + } + case KVM_S390_CLEAR_RESET: + r = 0; + kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_S390_INITIAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_S390_NORMAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_arch_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_arch_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } +#ifdef CONFIG_KVM_S390_UCONTROL + case KVM_S390_UCAS_MAP: { + struct kvm_s390_ucas_mapping ucasmap; + + if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { + r = -EFAULT; + break; + } + + if (!kvm_is_ucontrol(vcpu->kvm)) { + r = -EINVAL; + break; + } + + r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, + ucasmap.vcpu_addr, ucasmap.length); + break; + } + case KVM_S390_UCAS_UNMAP: { + struct kvm_s390_ucas_mapping ucasmap; + + if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { + r = -EFAULT; + break; + } + + if (!kvm_is_ucontrol(vcpu->kvm)) { + r = -EINVAL; + break; + } + + r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, + ucasmap.length); + break; + } +#endif + case KVM_S390_VCPU_FAULT: { + r = gmap_fault(vcpu->arch.gmap, arg, 0); + break; + } + case KVM_ENABLE_CAP: + { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + break; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op); + else + r = -EFAULT; + break; + } + case KVM_S390_SET_IRQ_STATE: { + struct kvm_s390_irq_state irq_state; + + r = -EFAULT; + if (copy_from_user(&irq_state, argp, sizeof(irq_state))) + break; + if (irq_state.len > VCPU_IRQS_MAX_BUF || + irq_state.len == 0 || + irq_state.len % sizeof(struct kvm_s390_irq) > 0) { + r = -EINVAL; + break; + } + /* do not use irq_state.flags, it will break old QEMUs */ + r = kvm_s390_set_irq_state(vcpu, + (void __user *) irq_state.buf, + irq_state.len); + break; + } + case KVM_S390_GET_IRQ_STATE: { + struct kvm_s390_irq_state irq_state; + + r = -EFAULT; + if (copy_from_user(&irq_state, argp, sizeof(irq_state))) + break; + if (irq_state.len == 0) { + r = -EINVAL; + break; + } + /* do not use irq_state.flags, it will break old QEMUs */ + r = kvm_s390_get_irq_state(vcpu, + (__u8 __user *) irq_state.buf, + irq_state.len); + break; + } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } + default: + r = -ENOTTY; + } + + vcpu_put(vcpu); + return r; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET) + && (kvm_is_ucontrol(vcpu->kvm))) { + vmf->page = virt_to_page(vcpu->arch.sie_block); + get_page(vmf->page); + return 0; + } +#endif + return VM_FAULT_SIGBUS; +} + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return true; +} + +/* Section: memory related */ +int kvm_arch_prepare_memory_region(struct kvm *kvm, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + gpa_t size; + + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ + + if (new->userspace_addr & 0xffffful) + return -EINVAL; + + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; + + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } + + if (!kvm->arch.migration_mode) + return 0; + + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging being enabled to store + * its dirty bitmap. + */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + int rc = 0; + + switch (change) { + case KVM_MR_DELETE: + rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, + old->npages * PAGE_SIZE); + break; + case KVM_MR_MOVE: + rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, + old->npages * PAGE_SIZE); + if (rc) + break; + fallthrough; + case KVM_MR_CREATE: + rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, + new->base_gfn * PAGE_SIZE, + new->npages * PAGE_SIZE); + break; + case KVM_MR_FLAGS_ONLY: + break; + default: + WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + } + if (rc) + pr_warn("failed to commit memory region\n"); + return; +} + +static inline unsigned long nonhyp_mask(int i) +{ + unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; + + return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); +} + +static int __init kvm_s390_init(void) +{ + int i, r; + + if (!sclp.has_sief2) { + pr_info("SIE is not available\n"); + return -ENODEV; + } + + if (nested && hpage) { + pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); + return -EINVAL; + } + + for (i = 0; i < 16; i++) + kvm_s390_fac_base[i] |= + stfle_fac_list[i] & nonhyp_mask(i); + + r = __kvm_s390_init(); + if (r) + return r; + + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; +} + +static void __exit kvm_s390_exit(void) +{ + kvm_exit(); + + __kvm_s390_exit(); +} + +module_init(kvm_s390_init); +module_exit(kvm_s390_exit); + +/* + * Enable autoloading of the kvm module. + * Note that we add the module alias here instead of virt/kvm/kvm_main.c + * since x86 takes a different approach. + */ +#include +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h new file mode 100644 index 0000000000..a7ea80cfa4 --- /dev/null +++ b/arch/s390/kvm/kvm-s390.h @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * definition for kvm on s390 + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + * Christian Borntraeger + * Christian Ehrhardt + */ + +#ifndef ARCH_S390_KVM_S390_H +#define ARCH_S390_KVM_S390_H + +#include +#include +#include +#include +#include +#include +#include + +/* Transactional Memory Execution related macros */ +#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) +#define TDB_FORMAT1 1 +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) + +extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + +#define KVM_EVENT(d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ + d_args); \ +} while (0) + +#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event(d_kvm->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ +} while (0) + +#define VCPU_EVENT(d_vcpu, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event(d_vcpu->kvm->arch.dbf, d_loglevel, \ + "%02d[%016lx-%016lx]: " d_string "\n", d_vcpu->vcpu_id, \ + d_vcpu->arch.sie_block->gpsw.mask, d_vcpu->arch.sie_block->gpsw.addr,\ + d_args); \ +} while (0) + +static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + atomic_or(flags, &vcpu->arch.sie_block->cpuflags); +} + +static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags); +} + +static inline bool kvm_s390_test_cpuflags(struct kvm_vcpu *vcpu, u32 flags) +{ + return (atomic_read(&vcpu->arch.sie_block->cpuflags) & flags) == flags; +} + +static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return kvm_s390_test_cpuflags(vcpu, CPUSTAT_STOPPED); +} + +static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) +{ + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); +} + +static inline int kvm_is_ucontrol(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if (kvm->arch.gmap) + return 0; + return 1; +#else + return 0; +#endif +} + +#define GUEST_PREFIX_SHIFT 13 +static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT; +} + +static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) +{ + VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id, + prefix); + vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); +} + +static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) +{ + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); + + if (ar) + *ar = base2; + + return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; +} + +static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, + u64 *address1, u64 *address2, + u8 *ar_b1, u8 *ar_b2) +{ + u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; + u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; + u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff; + + *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1; + *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; + + if (ar_b1) + *ar_b1 = base1; + if (ar_b2) + *ar_b2 = base2; +} + +static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2) +{ + if (r1) + *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; + if (r2) + *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; +} + +static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar) +{ + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + + ((vcpu->arch.sie_block->ipb & 0xff00) << 4); + /* The displacement is a 20bit _SIGNED_ value */ + if (disp2 & 0x80000) + disp2+=0xfff00000; + + if (ar) + *ar = base2; + + return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; +} + +static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar) +{ + u32 base2 = vcpu->arch.sie_block->ipb >> 28; + u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); + + if (ar) + *ar = base2; + + return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; +} + +/* Set the condition code in the guest program status word */ +static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc) +{ + vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44); + vcpu->arch.sie_block->gpsw.mask |= cc << 44; +} + +/* test availability of facility in a kvm instance */ +static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr) +{ + return __test_facility(nr, kvm->arch.model.fac_mask) && + __test_facility(nr, kvm->arch.model.fac_list); +} + +static inline int set_kvm_facility(u64 *fac_list, unsigned long nr) +{ + unsigned char *ptr; + + if (nr >= MAX_FACILITY_BIT) + return -EINVAL; + ptr = (unsigned char *) fac_list + (nr >> 3); + *ptr |= (0x80UL >> (nr & 7)); + return 0; +} + +static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr) +{ + WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS); + return test_bit_inv(nr, kvm->arch.cpu_feat); +} + +/* are cpu states controlled by user space */ +static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) +{ + return kvm->arch.user_cpu_state_ctrl != 0; +} + +static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm) +{ + if (kvm->arch.user_cpu_state_ctrl) + return; + + VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control"); + kvm->arch.user_cpu_state_ctrl = 1; +} + +/* get the end gfn of the last (highest gfn) memslot */ +static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) +{ + struct rb_node *node; + struct kvm_memory_slot *ms; + + if (WARN_ON(kvm_memslots_empty(slots))) + return 0; + + node = rb_last(&slots->gfn_tree); + ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]); + return ms->base_gfn + ms->npages; +} + +static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) +{ + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + + if (gd && sclp.has_gisaf) + gd |= GISA_FORMAT1; + return gd; +} + +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + +/* implemented in interrupt.c */ +int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); +int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); +void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu); +void kvm_s390_clear_float_irqs(struct kvm *kvm); +int __must_check kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int); +int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, + struct kvm_s390_irq *irq); +static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, + struct kvm_s390_pgm_info *pgm_info) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm = *pgm_info, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} +static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = code, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} +struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, + u64 isc_mask, u32 schid); +int kvm_s390_reinject_io_int(struct kvm *kvm, + struct kvm_s390_interrupt_info *inti); +int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked); + +/* implemented in intercept.c */ +u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu); +int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); +static inline void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilen) +{ + struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; + + sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilen); +} +static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) +{ + kvm_s390_rewind_psw(vcpu, -ilen); +} +static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) +{ + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; + kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); +} + +int handle_sthyi(struct kvm_vcpu *vcpu); + +/* implemented in priv.c */ +int is_valid_psw(psw_t *psw); +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); +int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu); +int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); +int kvm_s390_handle_01(struct kvm_vcpu *vcpu); +int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); +int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); +int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); +int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); +int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); + +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + +/* implemented in sigp.c */ +int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); +int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); + +/* implemented in kvm-s390.c */ +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); +long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); +int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); +int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); +void exit_sie(struct kvm_vcpu *vcpu); +void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); +void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); +__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); + +/* implemented in diag.c */ +int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); + +static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + WARN_ON(!mutex_is_locked(&kvm->lock)); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_s390_vcpu_block(vcpu); +} + +static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_s390_vcpu_unblock(vcpu); +} + +static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm) +{ + u64 rc; + + preempt_disable(); + rc = get_tod_clock_fast() + kvm->arch.epoch; + preempt_enable(); + return rc; +} + +/** + * kvm_s390_inject_prog_cond - conditionally inject a program check + * @vcpu: virtual cpu + * @rc: original return/error code + * + * This function is supposed to be used after regular guest access functions + * failed, to conditionally inject a program check to a vcpu. The typical + * pattern would look like + * + * rc = write_guest(vcpu, addr, data, len); + * if (rc) + * return kvm_s390_inject_prog_cond(vcpu, rc); + * + * A negative return code from guest access functions implies an internal error + * like e.g. out of memory. In these cases no program check should be injected + * to the guest. + * A positive value implies that an exception happened while accessing a guest's + * memory. In this case all data belonging to the corresponding program check + * has been stored in vcpu->arch.pgm and can be injected with + * kvm_s390_inject_prog_irq(). + * + * Returns: - the original @rc value if @rc was negative (internal error) + * - zero if @rc was already zero + * - zero or error code from injecting if @rc was positive + * (program check injected to @vcpu) + */ +static inline int kvm_s390_inject_prog_cond(struct kvm_vcpu *vcpu, int rc) +{ + if (rc <= 0) + return rc; + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); +} + +int s390int_to_s390irq(struct kvm_s390_interrupt *s390int, + struct kvm_s390_irq *s390irq); + +/* implemented in interrupt.c */ +int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop); +int psw_extint_disabled(struct kvm_vcpu *vcpu); +void kvm_s390_destroy_adapters(struct kvm *kvm); +int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); +extern struct kvm_device_ops kvm_flic_ops; +int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu); +void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); +int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, + void __user *buf, int len); +int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, + __u8 __user *buf, int len); +void kvm_s390_gisa_init(struct kvm *kvm); +void kvm_s390_gisa_clear(struct kvm *kvm); +void kvm_s390_gisa_destroy(struct kvm *kvm); +void kvm_s390_gisa_disable(struct kvm *kvm); +void kvm_s390_gisa_enable(struct kvm *kvm); +int __init kvm_s390_gib_init(u8 nisc); +void kvm_s390_gib_destroy(void); + +/* implemented in guestdbg.c */ +void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); +void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu); +void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu); +int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); +void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); +void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); + +/* support for Basic/Extended SCA handling */ +static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) +{ + struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */ + + return &sca->ipte_control; +} +static inline int kvm_s390_use_sca_entries(void) +{ + /* + * Without SIGP interpretation, only SRS interpretation (if available) + * might use the entries. By not setting the entries and keeping them + * invalid, hardware will not access them but intercept. + */ + return sclp.has_sigpif; +} +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info); + +/** + * kvm_s390_vcpu_crypto_reset_all + * + * Reset the crypto attributes for each vcpu. This can be done while the vcpus + * are running as each vcpu will be removed from SIE before resetting the crypt + * attributes and restored to SIE afterward. + * + * Note: The kvm->lock must be held while calling this function + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding per second + */ +extern unsigned int diag9c_forwarding_hz; + +#endif diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 0000000000..ffa7739c7a --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato + */ + +#include +#include +#include +#include +#include +#include +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel, the information preserved in zpci_aipb and zpci_aif_sbv + * in case we insert the KVM module again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_virt(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used as this is assigned + * during KVM module load. + */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = NULL; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpretation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) +{ + struct zpci_dev *zdev = opaque; + u8 status; + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. + */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} + +static void kvm_s390_pci_unregister_kvm(void *opaque) +{ + struct zpci_dev *zdev = opaque; + struct kvm *kvm; + u8 status; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. + */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int __init kvm_s390_pci_init(void) +{ + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; + + if (!kvm_s390_pci_interp_allowed()) + return 0; + + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; + + if (!kvm_s390_pci_interp_allowed()) + return; + + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 0000000000..ff0972dd5e --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include +#include +#include +#include +#include +#include + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev || + !aift->kzdev[si]) + return NULL; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int __init kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c new file mode 100644 index 0000000000..dc4cfa8795 --- /dev/null +++ b/arch/s390/kvm/priv.c @@ -0,0 +1,1573 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * handling privileged instructions + * + * Copyright IBM Corp. 2008, 2020 + * + * Author(s): Carsten Otte + * Christian Borntraeger + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gaccess.h" +#include "kvm-s390.h" +#include "trace.h" + +static int handle_ri(struct kvm_vcpu *vcpu) +{ + vcpu->stat.instruction_ri++; + + if (test_kvm_facility(vcpu->kvm, 64)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)"); + vcpu->arch.sie_block->ecb3 |= ECB3_RI; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.sie_block->ipa & 0xf) <= 4) + return handle_ri(vcpu); + else + return -EOPNOTSUPP; +} + +static int handle_gs(struct kvm_vcpu *vcpu) +{ + vcpu->stat.instruction_gs++; + + if (test_kvm_facility(vcpu->kvm, 133)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); + preempt_disable(); + __ctl_set_bit(2, 4); + current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + preempt_enable(); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu) +{ + int code = vcpu->arch.sie_block->ipb & 0xff; + + if (code == 0x49 || code == 0x4d) + return handle_gs(vcpu); + else + return -EOPNOTSUPP; +} +/* Handle SCK (SET CLOCK) interception */ +static int handle_set_clock(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_vm_tod_clock gtod = { 0 }; + int rc; + u8 ar; + u64 op2; + + vcpu->stat.instruction_sck++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + op2 = kvm_s390_get_base_disp_s(vcpu, &ar); + if (op2 & 7) /* Operand must be on a doubleword boundary */ + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + rc = read_guest(vcpu, op2, ar, >od.tod, sizeof(gtod.tod)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod); + /* + * To set the TOD clock the kvm lock must be taken, but the vcpu lock + * is already held in handle_set_clock. The usual lock order is the + * opposite. As SCK is deprecated and should not be used in several + * cases, for example when the multiple epoch facility or TOD clock + * steering facility is installed (see Principles of Operation), a + * slow path can be used. If the lock can not be taken via try_lock, + * the instruction will be retried via -EAGAIN at a later point in + * time. + */ + if (!kvm_s390_try_set_tod_clock(vcpu->kvm, >od)) { + kvm_s390_retry_instr(vcpu); + return -EAGAIN; + } + + kvm_s390_set_psw_cc(vcpu, 0); + return 0; +} + +static int handle_set_prefix(struct kvm_vcpu *vcpu) +{ + u64 operand2; + u32 address; + int rc; + u8 ar; + + vcpu->stat.instruction_spx++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); + + /* must be word boundary */ + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* get the value */ + rc = read_guest(vcpu, operand2, ar, &address, sizeof(address)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + address &= 0x7fffe000u; + + /* + * Make sure the new value is valid memory. We only need to check the + * first page, since address is 8k aligned and memory pieces are always + * at least 1MB aligned and have at least a size of 1MB. + */ + if (kvm_is_error_gpa(vcpu->kvm, address)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + kvm_s390_set_prefix(vcpu, address); + trace_kvm_s390_handle_prefix(vcpu, 1, address); + return 0; +} + +static int handle_store_prefix(struct kvm_vcpu *vcpu) +{ + u64 operand2; + u32 address; + int rc; + u8 ar; + + vcpu->stat.instruction_stpx++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); + + /* must be word boundary */ + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + address = kvm_s390_get_prefix(vcpu); + + /* get the value */ + rc = write_guest(vcpu, operand2, ar, &address, sizeof(address)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + VCPU_EVENT(vcpu, 3, "STPX: storing prefix 0x%x into 0x%llx", address, operand2); + trace_kvm_s390_handle_prefix(vcpu, 0, address); + return 0; +} + +static int handle_store_cpu_address(struct kvm_vcpu *vcpu) +{ + u16 vcpu_id = vcpu->vcpu_id; + u64 ga; + int rc; + u8 ar; + + vcpu->stat.instruction_stap++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + ga = kvm_s390_get_base_disp_s(vcpu, &ar); + + if (ga & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + rc = write_guest(vcpu, ga, ar, &vcpu_id, sizeof(vcpu_id)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + VCPU_EVENT(vcpu, 3, "STAP: storing cpu address (%u) to 0x%llx", vcpu_id, ga); + trace_kvm_s390_handle_stap(vcpu, ga); + return 0; +} + +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) +{ + int rc; + + trace_kvm_s390_skey_related_inst(vcpu); + /* Already enabled? */ + if (vcpu->arch.skey_enabled) + return 0; + + rc = s390_enable_skey(); + VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); + if (rc) + return rc; + + if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS); + if (!vcpu->kvm->arch.use_skf) + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + else + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + vcpu->arch.skey_enabled = true; + return 0; +} + +static int try_handle_skey(struct kvm_vcpu *vcpu) +{ + int rc; + + rc = kvm_s390_skey_check_enable(vcpu); + if (rc) + return rc; + if (vcpu->kvm->arch.use_skf) { + /* with storage-key facility, SIE interprets it for us */ + kvm_s390_retry_instr(vcpu); + VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); + return -EAGAIN; + } + return 0; +} + +static int handle_iske(struct kvm_vcpu *vcpu) +{ + unsigned long gaddr, vmaddr; + unsigned char key; + int reg1, reg2; + bool unlocked; + int rc; + + vcpu->stat.instruction_iske++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); + gaddr = kvm_s390_real_to_abs(vcpu, gaddr); + vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); + if (kvm_is_error_hva(vmaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); +retry: + unlocked = false; + mmap_read_lock(current->mm); + rc = get_guest_storage_key(current->mm, vmaddr, &key); + + if (rc) { + rc = fixup_user_fault(current->mm, vmaddr, + FAULT_FLAG_WRITE, &unlocked); + if (!rc) { + mmap_read_unlock(current->mm); + goto retry; + } + } + mmap_read_unlock(current->mm); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; + vcpu->run->s.regs.gprs[reg1] &= ~0xff; + vcpu->run->s.regs.gprs[reg1] |= key; + return 0; +} + +static int handle_rrbe(struct kvm_vcpu *vcpu) +{ + unsigned long vmaddr, gaddr; + int reg1, reg2; + bool unlocked; + int rc; + + vcpu->stat.instruction_rrbe++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); + gaddr = kvm_s390_real_to_abs(vcpu, gaddr); + vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); + if (kvm_is_error_hva(vmaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); +retry: + unlocked = false; + mmap_read_lock(current->mm); + rc = reset_guest_reference_bit(current->mm, vmaddr); + if (rc < 0) { + rc = fixup_user_fault(current->mm, vmaddr, + FAULT_FLAG_WRITE, &unlocked); + if (!rc) { + mmap_read_unlock(current->mm); + goto retry; + } + } + mmap_read_unlock(current->mm); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; + kvm_s390_set_psw_cc(vcpu, rc); + return 0; +} + +#define SSKE_NQ 0x8 +#define SSKE_MR 0x4 +#define SSKE_MC 0x2 +#define SSKE_MB 0x1 +static int handle_sske(struct kvm_vcpu *vcpu) +{ + unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; + unsigned long start, end; + unsigned char key, oldkey; + int reg1, reg2; + bool unlocked; + int rc; + + vcpu->stat.instruction_sske++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + if (!test_kvm_facility(vcpu->kvm, 8)) + m3 &= ~SSKE_MB; + if (!test_kvm_facility(vcpu->kvm, 10)) + m3 &= ~(SSKE_MC | SSKE_MR); + if (!test_kvm_facility(vcpu->kvm, 14)) + m3 &= ~SSKE_NQ; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + start = kvm_s390_logical_to_effective(vcpu, start); + if (m3 & SSKE_MB) { + /* start already designates an absolute address */ + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); + } else { + start = kvm_s390_real_to_abs(vcpu, start); + end = start + PAGE_SIZE; + } + + while (start != end) { + unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); + unlocked = false; + + if (kvm_is_error_hva(vmaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + mmap_read_lock(current->mm); + rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, + m3 & SSKE_MC); + + if (rc < 0) { + rc = fixup_user_fault(current->mm, vmaddr, + FAULT_FLAG_WRITE, &unlocked); + rc = !rc ? -EAGAIN : rc; + } + mmap_read_unlock(current->mm); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; + if (rc < 0) + return rc; + start += PAGE_SIZE; + } + + if (m3 & (SSKE_MC | SSKE_MR)) { + if (m3 & SSKE_MB) { + /* skey in reg1 is unpredictable */ + kvm_s390_set_psw_cc(vcpu, 3); + } else { + kvm_s390_set_psw_cc(vcpu, rc); + vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; + vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + } + } + if (m3 & SSKE_MB) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) + vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; + else + vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } + return 0; +} + +static int handle_ipte_interlock(struct kvm_vcpu *vcpu) +{ + vcpu->stat.instruction_ipte_interlock++; + if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); + kvm_s390_retry_instr(vcpu); + VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); + return 0; +} + +static int handle_test_block(struct kvm_vcpu *vcpu) +{ + gpa_t addr; + int reg2; + + vcpu->stat.instruction_tb++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + kvm_s390_get_regs_rre(vcpu, NULL, ®2); + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + if (kvm_s390_check_low_addr_prot_real(vcpu, addr)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + addr = kvm_s390_real_to_abs(vcpu, addr); + + if (kvm_is_error_gpa(vcpu->kvm, addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + /* + * We don't expect errors on modern systems, and do not care + * about storage keys (yet), so let's just clear the page. + */ + if (kvm_clear_guest(vcpu->kvm, addr, PAGE_SIZE)) + return -EFAULT; + kvm_s390_set_psw_cc(vcpu, 0); + vcpu->run->s.regs.gprs[0] = 0; + return 0; +} + +static int handle_tpi(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_interrupt_info *inti; + unsigned long len; + u32 tpi_data[3]; + int rc; + u64 addr; + u8 ar; + + vcpu->stat.instruction_tpi++; + + addr = kvm_s390_get_base_disp_s(vcpu, &ar); + if (addr & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->arch.sie_block->gcr[6], 0); + if (!inti) { + kvm_s390_set_psw_cc(vcpu, 0); + return 0; + } + + tpi_data[0] = inti->io.subchannel_id << 16 | inti->io.subchannel_nr; + tpi_data[1] = inti->io.io_int_parm; + tpi_data[2] = inti->io.io_int_word; + if (addr) { + /* + * Store the two-word I/O interruption code into the + * provided area. + */ + len = sizeof(tpi_data) - 4; + rc = write_guest(vcpu, addr, ar, &tpi_data, len); + if (rc) { + rc = kvm_s390_inject_prog_cond(vcpu, rc); + goto reinject_interrupt; + } + } else { + /* + * Store the three-word I/O interruption code into + * the appropriate lowcore area. + */ + len = sizeof(tpi_data); + if (write_guest_lc(vcpu, __LC_SUBCHANNEL_ID, &tpi_data, len)) { + /* failed writes to the low core are not recoverable */ + rc = -EFAULT; + goto reinject_interrupt; + } + } + + /* irq was successfully handed to the guest */ + kfree(inti); + kvm_s390_set_psw_cc(vcpu, 1); + return 0; +reinject_interrupt: + /* + * If we encounter a problem storing the interruption code, the + * instruction is suppressed from the guest's view: reinject the + * interrupt. + */ + if (kvm_s390_reinject_io_int(vcpu->kvm, inti)) { + kfree(inti); + rc = -EFAULT; + } + /* don't set the cc, a pgm irq was injected or we drop to user space */ + return rc ? -EFAULT : 0; +} + +static int handle_tsch(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_interrupt_info *inti = NULL; + const u64 isc_mask = 0xffUL << 24; /* all iscs set */ + + vcpu->stat.instruction_tsch++; + + /* a valid schid has at least one bit set */ + if (vcpu->run->s.regs.gprs[1]) + inti = kvm_s390_get_io_int(vcpu->kvm, isc_mask, + vcpu->run->s.regs.gprs[1]); + + /* + * Prepare exit to userspace. + * We indicate whether we dequeued a pending I/O interrupt + * so that userspace can re-inject it if the instruction gets + * a program check. While this may re-order the pending I/O + * interrupts, this is no problem since the priority is kept + * intact. + */ + vcpu->run->exit_reason = KVM_EXIT_S390_TSCH; + vcpu->run->s390_tsch.dequeued = !!inti; + if (inti) { + vcpu->run->s390_tsch.subchannel_id = inti->io.subchannel_id; + vcpu->run->s390_tsch.subchannel_nr = inti->io.subchannel_nr; + vcpu->run->s390_tsch.io_int_parm = inti->io.io_int_parm; + vcpu->run->s390_tsch.io_int_word = inti->io.io_int_word; + } + vcpu->run->s390_tsch.ipb = vcpu->arch.sie_block->ipb; + kfree(inti); + return -EREMOTE; +} + +static int handle_io_inst(struct kvm_vcpu *vcpu) +{ + VCPU_EVENT(vcpu, 4, "%s", "I/O instruction"); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + if (vcpu->kvm->arch.css_support) { + /* + * Most I/O instructions will be handled by userspace. + * Exceptions are tpi and the interrupt portion of tsch. + */ + if (vcpu->arch.sie_block->ipa == 0xb236) + return handle_tpi(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb235) + return handle_tsch(vcpu); + /* Handle in userspace. */ + vcpu->stat.instruction_io_other++; + return -EOPNOTSUPP; + } else { + /* + * Set condition code 3 to stop the guest from issuing channel + * I/O instructions. + */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; + } +} + +/* + * handle_pqap: Handling pqap interception + * @vcpu: the vcpu having issue the pqap instruction + * + * We now support PQAP/AQIC instructions and we need to correctly + * answer the guest even if no dedicated driver's hook is available. + * + * The intercepting code calls a dedicated callback for this instruction + * if a driver did register one in the CRYPTO satellite of the + * SIE block. + * + * If no callback is available, the queues are not available, return this + * response code to the caller and set CC to 3. + * Else return the response code returned by the callback. + */ +static int handle_pqap(struct kvm_vcpu *vcpu) +{ + struct ap_queue_status status = {}; + crypto_hook pqap_hook; + unsigned long reg0; + int ret; + uint8_t fc; + + /* Verify that the AP instruction are available */ + if (!ap_instructions_available()) + return -EOPNOTSUPP; + /* Verify that the guest is allowed to use AP instructions */ + if (!(vcpu->arch.sie_block->eca & ECA_APIE)) + return -EOPNOTSUPP; + /* + * The only possibly intercepted functions when AP instructions are + * available for the guest are AQIC and TAPQ with the t bit set + * since we do not set IC.3 (FIII) we currently will only intercept + * the AQIC function code. + * Note: running nested under z/VM can result in intercepts for other + * function codes, e.g. PQAP(QCI). We do not support this and bail out. + */ + reg0 = vcpu->run->s.regs.gprs[0]; + fc = (reg0 >> 24) & 0xff; + if (fc != 0x03) + return -EOPNOTSUPP; + + /* PQAP instruction is allowed for guest kernel only */ + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + /* Common PQAP instruction specification exceptions */ + /* bits 41-47 must all be zeros */ + if (reg0 & 0x007f0000UL) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* APFT not install and T bit set */ + if (!test_kvm_facility(vcpu->kvm, 15) && (reg0 & 0x00800000UL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* APXA not installed and APID greater 64 or APQI greater 16 */ + if (!(vcpu->kvm->arch.crypto.crycbd & 0x02) && (reg0 & 0x0000c0f0UL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* AQIC function code specific exception */ + /* facility 65 not present for AQIC function code */ + if (!test_kvm_facility(vcpu->kvm, 65)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* + * If the hook callback is registered, there will be a pointer to the + * hook function pointer in the kvm_s390_crypto structure. Lock the + * owner, retrieve the hook function pointer and call the hook. + */ + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); + if (vcpu->kvm->arch.crypto.pqap_hook) { + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; + ret = pqap_hook(vcpu); + if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) + kvm_s390_set_psw_cc(vcpu, 3); + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); + return ret; + } + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); + /* + * A vfio_driver must register a hook. + * No hook means no driver to enable the SIE CRYCB and no queues. + * We send this response to the guest. + */ + status.response_code = 0x01; + memcpy(&vcpu->run->s.regs.gprs[1], &status, sizeof(status)); + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + +static int handle_stfl(struct kvm_vcpu *vcpu) +{ + int rc; + unsigned int fac; + + vcpu->stat.instruction_stfl++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + /* + * We need to shift the lower 32 facility bits (bit 0-31) from a u64 + * into a u32 memory representation. They will remain bits 0-31. + */ + fac = *vcpu->kvm->arch.model.fac_list >> 32; + rc = write_guest_lc(vcpu, offsetof(struct lowcore, stfl_fac_list), + &fac, sizeof(fac)); + if (rc) + return rc; + VCPU_EVENT(vcpu, 3, "STFL: store facility list 0x%x", fac); + trace_kvm_s390_handle_stfl(vcpu, fac); + return 0; +} + +#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA) +#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL +#define PSW_ADDR_24 0x0000000000ffffffUL +#define PSW_ADDR_31 0x000000007fffffffUL + +int is_valid_psw(psw_t *psw) +{ + if (psw->mask & PSW_MASK_UNASSIGNED) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) { + if (psw->addr & ~PSW_ADDR_31) + return 0; + } + if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24)) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA) + return 0; + if (psw->addr & 1) + return 0; + return 1; +} + +int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) +{ + psw_t *gpsw = &vcpu->arch.sie_block->gpsw; + psw_compat_t new_psw; + u64 addr; + int rc; + u8 ar; + + vcpu->stat.instruction_lpsw++; + + if (gpsw->mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + addr = kvm_s390_get_base_disp_s(vcpu, &ar); + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + if (!(new_psw.mask & PSW32_MASK_BASE)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; + gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE; + gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; + if (!is_valid_psw(gpsw)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + return 0; +} + +static int handle_lpswe(struct kvm_vcpu *vcpu) +{ + psw_t new_psw; + u64 addr; + int rc; + u8 ar; + + vcpu->stat.instruction_lpswe++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + addr = kvm_s390_get_base_disp_s(vcpu, &ar); + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + vcpu->arch.sie_block->gpsw = new_psw; + if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + return 0; +} + +static int handle_stidp(struct kvm_vcpu *vcpu) +{ + u64 stidp_data = vcpu->kvm->arch.model.cpuid; + u64 operand2; + int rc; + u8 ar; + + vcpu->stat.instruction_stidp++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); + + if (operand2 & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + rc = write_guest(vcpu, operand2, ar, &stidp_data, sizeof(stidp_data)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + VCPU_EVENT(vcpu, 3, "STIDP: store cpu id 0x%llx", stidp_data); + return 0; +} + +static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) +{ + int cpus = 0; + int n; + + cpus = atomic_read(&vcpu->kvm->online_vcpus); + + /* deal with other level 3 hypervisors */ + if (stsi(mem, 3, 2, 2)) + mem->count = 0; + if (mem->count < 8) + mem->count++; + for (n = mem->count - 1; n > 0 ; n--) + memcpy(&mem->vm[n], &mem->vm[n - 1], sizeof(mem->vm[0])); + + memset(&mem->vm[0], 0, sizeof(mem->vm[0])); + mem->vm[0].cpus_total = cpus; + mem->vm[0].cpus_configured = cpus; + mem->vm[0].cpus_standby = 0; + mem->vm[0].cpus_reserved = 0; + mem->vm[0].caf = 1000; + memcpy(mem->vm[0].name, "KVMguest", 8); + ASCEBC(mem->vm[0].name, 8); + memcpy(mem->vm[0].cpi, "KVM/Linux ", 16); + ASCEBC(mem->vm[0].cpi, 16); +} + +static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar, + u8 fc, u8 sel1, u16 sel2) +{ + vcpu->run->exit_reason = KVM_EXIT_S390_STSI; + vcpu->run->s390_stsi.addr = addr; + vcpu->run->s390_stsi.ar = ar; + vcpu->run->s390_stsi.fc = fc; + vcpu->run->s390_stsi.sel1 = sel1; + vcpu->run->s390_stsi.sel2 = sel2; +} + +static int handle_stsi(struct kvm_vcpu *vcpu) +{ + int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; + int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; + int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; + unsigned long mem = 0; + u64 operand2; + int rc = 0; + u8 ar; + + vcpu->stat.instruction_stsi++; + VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; + + if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 + || vcpu->run->s.regs.gprs[1] & 0xffff0000) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (fc == 0) { + vcpu->run->s.regs.gprs[0] = 3 << 28; + kvm_s390_set_psw_cc(vcpu, 0); + return 0; + } + + operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); + + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + switch (fc) { + case 1: /* same handling for 1 and 2 */ + case 2: + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!mem) + goto out_no_data; + if (stsi((void *) mem, fc, sel1, sel2)) + goto out_no_data; + break; + case 3: + if (sel1 != 2 || sel2 != 2) + goto out_no_data; + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!mem) + goto out_no_data; + handle_stsi_3_2_2(vcpu, (void *) mem); + break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; + } + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); + } + if (rc) { + rc = kvm_s390_inject_prog_cond(vcpu, rc); + goto out; + } + if (vcpu->kvm->arch.user_stsi) { + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + rc = -EREMOTE; + } + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + free_page(mem); + kvm_s390_set_psw_cc(vcpu, 0); + vcpu->run->s.regs.gprs[0] = 0; + return rc; +out_no_data: + kvm_s390_set_psw_cc(vcpu, 3); +out: + free_page(mem); + return rc; +} + +int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x02: + return handle_stidp(vcpu); + case 0x04: + return handle_set_clock(vcpu); + case 0x10: + return handle_set_prefix(vcpu); + case 0x11: + return handle_store_prefix(vcpu); + case 0x12: + return handle_store_cpu_address(vcpu); + case 0x14: + return kvm_s390_handle_vsie(vcpu); + case 0x21: + case 0x50: + return handle_ipte_interlock(vcpu); + case 0x29: + return handle_iske(vcpu); + case 0x2a: + return handle_rrbe(vcpu); + case 0x2b: + return handle_sske(vcpu); + case 0x2c: + return handle_test_block(vcpu); + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x5f: + case 0x74: + case 0x76: + return handle_io_inst(vcpu); + case 0x56: + return handle_sthyi(vcpu); + case 0x7d: + return handle_stsi(vcpu); + case 0xaf: + return handle_pqap(vcpu); + case 0xb1: + return handle_stfl(vcpu); + case 0xb2: + return handle_lpswe(vcpu); + default: + return -EOPNOTSUPP; + } +} + +static int handle_epsw(struct kvm_vcpu *vcpu) +{ + int reg1, reg2; + + vcpu->stat.instruction_epsw++; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + /* This basically extracts the mask half of the psw. */ + vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL; + vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32; + if (reg2) { + vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL; + vcpu->run->s.regs.gprs[reg2] |= + vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL; + } + return 0; +} + +#define PFMF_RESERVED 0xfffc0101UL +#define PFMF_SK 0x00020000UL +#define PFMF_CF 0x00010000UL +#define PFMF_UI 0x00008000UL +#define PFMF_FSC 0x00007000UL +#define PFMF_NQ 0x00000800UL +#define PFMF_MR 0x00000400UL +#define PFMF_MC 0x00000200UL +#define PFMF_KEY 0x000000feUL + +static int handle_pfmf(struct kvm_vcpu *vcpu) +{ + bool mr = false, mc = false, nq; + int reg1, reg2; + unsigned long start, end; + unsigned char key; + + vcpu->stat.instruction_pfmf++; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + if (!test_kvm_facility(vcpu->kvm, 8)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* Only provide non-quiescing support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && + !test_kvm_facility(vcpu->kvm, 14)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* Only provide conditional-SSKE support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK && + test_kvm_facility(vcpu->kvm, 10)) { + mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR; + mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC; + } + + nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; + key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + start = kvm_s390_logical_to_effective(vcpu, start); + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_s390_check_low_addr_prot_real(vcpu, start)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + } + + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + case 0x00000000: + /* only 4k frames specify a real address */ + start = kvm_s390_real_to_abs(vcpu, start); + end = (start + PAGE_SIZE) & ~(PAGE_SIZE - 1); + break; + case 0x00001000: + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); + break; + case 0x00002000: + /* only support 2G frame size if EDAT2 is available and we are + not in 24-bit addressing mode */ + if (!test_kvm_facility(vcpu->kvm, 78) || + psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + end = (start + _REGION3_SIZE) & ~(_REGION3_SIZE - 1); + break; + default: + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + } + + while (start != end) { + unsigned long vmaddr; + bool unlocked = false; + + /* Translate guest address to host address */ + vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); + if (kvm_is_error_hva(vmaddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + } + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) { + int rc = kvm_s390_skey_check_enable(vcpu); + + if (rc) + return rc; + mmap_read_lock(current->mm); + rc = cond_set_guest_storage_key(current->mm, vmaddr, + key, NULL, nq, mr, mc); + if (rc < 0) { + rc = fixup_user_fault(current->mm, vmaddr, + FAULT_FLAG_WRITE, &unlocked); + rc = !rc ? -EAGAIN : rc; + } + mmap_read_unlock(current->mm); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; + if (rc < 0) + return rc; + } + start += PAGE_SIZE; + } + if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) { + vcpu->run->s.regs.gprs[reg2] = end; + } else { + vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } + } + return 0; +} + +/* + * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu) + */ +static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) +{ + int r1, r2, nappended, entries; + unsigned long gfn, hva, res, pgstev, ptev; + unsigned long *cbrlo; + + /* + * We don't need to set SD.FPF.SK to 1 here, because if we have a + * machine check here we either handle it or crash + */ + + kvm_s390_get_regs_rre(vcpu, &r1, &r2); + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; + hva = gfn_to_hva(vcpu->kvm, gfn); + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); + if (nappended < 0) { + res = orc ? 0x10 : 0; + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + return 0; + } + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; + /* + * Set the block-content state part of the result. 0 means resident, so + * nothing to do if the page is valid. 2 is for preserved pages + * (non-present and non-zero), and 3 for zero pages (non-present and + * zero). + */ + if (ptev & _PAGE_INVALID) { + res |= 2; + if (pgstev & _PGSTE_GPS_ZERO) + res |= 1; + } + if (pgstev & _PGSTE_GPS_NODAT) + res |= 0x20; + vcpu->run->s.regs.gprs[r1] = res; + /* + * It is possible that all the normal 511 slots were full, in which case + * we will now write in the 512th slot, which is reserved for host use. + * In both cases we let the normal essa handling code process all the + * slots, including the reserved one, if needed. + */ + if (nappended > 0) { + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); + cbrlo[entries] = gfn << PAGE_SHIFT; + } + + if (orc) { + struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); + + /* Increment only if we are really flipping the bit */ + if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); + } + + return nappended; +} + +static int handle_essa(struct kvm_vcpu *vcpu) +{ + /* entries expected to be 1FF */ + int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + unsigned long *cbrlo; + struct gmap *gmap; + int i, orc; + + VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); + gmap = vcpu->arch.gmap; + vcpu->stat.instruction_essa++; + if (!vcpu->kvm->arch.use_cmma) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + /* Check for invalid operation request code */ + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + /* ORCs 0-6 are always valid */ + if (orc > (test_kvm_facility(vcpu->kvm, 147) ? ESSA_SET_STABLE_NODAT + : ESSA_SET_STABLE_IF_RESIDENT)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (!vcpu->kvm->arch.migration_mode) { + /* + * CMMA is enabled in the KVM settings, but is disabled in + * the SIE block and in the mm_context, and we are not doing + * a migration. Enable CMMA in the mm_context. + * Since we need to take a write lock to write to the context + * to avoid races with storage keys handling, we check if the + * value really needs to be written to; if the value is + * already correct, we do nothing and avoid the lock. + */ + if (vcpu->kvm->mm->context.uses_cmm == 0) { + mmap_write_lock(vcpu->kvm->mm); + vcpu->kvm->mm->context.uses_cmm = 1; + mmap_write_unlock(vcpu->kvm->mm); + } + /* + * If we are here, we are supposed to have CMMA enabled in + * the SIE block. Enabling CMMA works on a per-CPU basis, + * while the context use_cmma flag is per process. + * It's possible that the context flag is enabled and the + * SIE flag is not, so we set the flag always; if it was + * already set, nothing changes, otherwise we enable it + * on this CPU too. + */ + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + /* Retry the ESSA instruction */ + kvm_s390_retry_instr(vcpu); + } else { + int srcu_idx; + + mmap_read_lock(vcpu->kvm->mm); + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + i = __do_essa(vcpu, orc); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + mmap_read_unlock(vcpu->kvm->mm); + if (i < 0) + return i; + /* Account for the possible extra cbrl entry */ + entries += i; + } + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); + mmap_read_lock(gmap->mm); + for (i = 0; i < entries; ++i) + __gmap_zap(gmap, cbrlo[i]); + mmap_read_unlock(gmap->mm); + return 0; +} + +int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x8a: + case 0x8e: + case 0x8f: + return handle_ipte_interlock(vcpu); + case 0x8d: + return handle_epsw(vcpu); + case 0xab: + return handle_essa(vcpu); + case 0xaf: + return handle_pfmf(vcpu); + default: + return -EOPNOTSUPP; + } +} + +int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + int reg, rc, nr_regs; + u32 ctl_array[16]; + u64 ga; + u8 ar; + + vcpu->stat.instruction_lctl++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + ga = kvm_s390_get_base_disp_rs(vcpu, &ar); + + if (ga & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 4, "LCTL: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); + trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga); + + nr_regs = ((reg3 - reg1) & 0xf) + 1; + rc = read_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u32)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + reg = reg1; + nr_regs = 0; + do { + vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; + vcpu->arch.sie_block->gcr[reg] |= ctl_array[nr_regs++]; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + int reg, rc, nr_regs; + u32 ctl_array[16]; + u64 ga; + u8 ar; + + vcpu->stat.instruction_stctl++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + ga = kvm_s390_get_base_disp_rs(vcpu, &ar); + + if (ga & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 4, "STCTL r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); + trace_kvm_s390_handle_stctl(vcpu, 0, reg1, reg3, ga); + + reg = reg1; + nr_regs = 0; + do { + ctl_array[nr_regs++] = vcpu->arch.sie_block->gcr[reg]; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + rc = write_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u32)); + return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0; +} + +static int handle_lctlg(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + int reg, rc, nr_regs; + u64 ctl_array[16]; + u64 ga; + u8 ar; + + vcpu->stat.instruction_lctlg++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + ga = kvm_s390_get_base_disp_rsy(vcpu, &ar); + + if (ga & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 4, "LCTLG: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); + trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga); + + nr_regs = ((reg3 - reg1) & 0xf) + 1; + rc = read_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u64)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + reg = reg1; + nr_regs = 0; + do { + vcpu->arch.sie_block->gcr[reg] = ctl_array[nr_regs++]; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +static int handle_stctg(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + int reg, rc, nr_regs; + u64 ctl_array[16]; + u64 ga; + u8 ar; + + vcpu->stat.instruction_stctg++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + ga = kvm_s390_get_base_disp_rsy(vcpu, &ar); + + if (ga & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 4, "STCTG r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); + trace_kvm_s390_handle_stctl(vcpu, 1, reg1, reg3, ga); + + reg = reg1; + nr_regs = 0; + do { + ctl_array[nr_regs++] = vcpu->arch.sie_block->gcr[reg]; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + rc = write_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u64)); + return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0; +} + +int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.sie_block->ipb & 0x000000ff) { + case 0x25: + return handle_stctg(vcpu); + case 0x2f: + return handle_lctlg(vcpu); + case 0x60: + case 0x61: + case 0x62: + return handle_ri(vcpu); + default: + return -EOPNOTSUPP; + } +} + +static int handle_tprot(struct kvm_vcpu *vcpu) +{ + u64 address, operand2; + unsigned long gpa; + u8 access_key; + bool writable; + int ret, cc; + u8 ar; + + vcpu->stat.instruction_tprot++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL); + access_key = (operand2 & 0xf0) >> 4; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) + ipte_lock(vcpu->kvm); + + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_STORE, access_key); + if (ret == 0) { + gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); + } else if (ret == PGM_PROTECTION) { + writable = false; + /* Write protected? Try again with read-only... */ + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_FETCH, access_key); + } + if (ret >= 0) { + cc = -1; + + /* Fetching permitted; storing permitted */ + if (ret == 0 && writable) + cc = 0; + /* Fetching permitted; storing not permitted */ + else if (ret == 0 && !writable) + cc = 1; + /* Fetching not permitted; storing not permitted */ + else if (ret == PGM_PROTECTION) + cc = 2; + /* Translation not available */ + else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC) + cc = 3; + + if (cc != -1) { + kvm_s390_set_psw_cc(vcpu, cc); + ret = 0; + } else { + ret = kvm_s390_inject_program_int(vcpu, ret); + } + } + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) + ipte_unlock(vcpu->kvm); + return ret; +} + +int kvm_s390_handle_e5(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x01: + return handle_tprot(vcpu); + default: + return -EOPNOTSUPP; + } +} + +static int handle_sckpf(struct kvm_vcpu *vcpu) +{ + u32 value; + + vcpu->stat.instruction_sckpf++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000) + return kvm_s390_inject_program_int(vcpu, + PGM_SPECIFICATION); + + value = vcpu->run->s.regs.gprs[0] & 0x000000000000ffff; + vcpu->arch.sie_block->todpr = value; + + return 0; +} + +static int handle_ptff(struct kvm_vcpu *vcpu) +{ + vcpu->stat.instruction_ptff++; + + /* we don't emulate any control instructions yet */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + +int kvm_s390_handle_01(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x04: + return handle_ptff(vcpu); + case 0x07: + return handle_sckpf(vcpu); + default: + return -EOPNOTSUPP; + } +} diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 0000000000..75e81ba26d --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,894 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" + +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc; + + if (!kvm_s390_pv_cpu_get_handle(vcpu)) + return 0; + + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc); + + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + void *sida_addr; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); + + /* Alloc Secure Instruction Data Area Designation */ + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + kvm_s390_clear_pv_state(kvm); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + npages = kvm_s390_get_gfn_end(kvm_memslots(kvm)); + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) +{ + int cc; + + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc && uvcb.header.rc != 0x104, + "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. + */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* + * Nothing to do if the counter was already 0. Otherwise make sure + * the counter does not reach 0 before calling s390_uv_destroy_range. + */ + if (!atomic_inc_not_zero(&kvm->mm->context.protected_count)) + return 0; + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + int r; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. + */ + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; + uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + atomic_inc(&kvm->mm->context.protected_count); + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + } else { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? -EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned. For instance + * if we encounter a fault after writing the first page of data. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Lets only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c new file mode 100644 index 0000000000..d9696b5300 --- /dev/null +++ b/arch/s390/kvm/sigp.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * handling interprocessor communication + * + * Copyright IBM Corp. 2008, 2013 + * + * Author(s): Carsten Otte + * Christian Borntraeger + * Christian Ehrhardt + */ + +#include +#include +#include +#include +#include "gaccess.h" +#include "kvm-s390.h" +#include "trace.h" + +static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, + u64 *reg) +{ + const bool stopped = kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED); + int rc; + int ext_call_pending; + + ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu); + if (!stopped && !ext_call_pending) + rc = SIGP_CC_ORDER_CODE_ACCEPTED; + else { + *reg &= 0xffffffff00000000UL; + if (ext_call_pending) + *reg |= SIGP_STATUS_EXT_CALL_PENDING; + if (stopped) + *reg |= SIGP_STATUS_STOPPED; + rc = SIGP_CC_STATUS_STORED; + } + + VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", dst_vcpu->vcpu_id, + rc); + return rc; +} + +static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_INT_EMERGENCY, + .u.emerg.code = vcpu->vcpu_id, + }; + int rc = 0; + + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (!rc) + VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", + dst_vcpu->vcpu_id); + + return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED; +} + +static int __sigp_emergency(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu) +{ + return __inject_sigp_emergency(vcpu, dst_vcpu); +} + +static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, + u16 asn, u64 *reg) +{ + const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; + u16 p_asn, s_asn; + psw_t *psw; + bool idle; + + idle = is_vcpu_idle(vcpu); + psw = &dst_vcpu->arch.sie_block->gpsw; + p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ + s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ + + /* Inject the emergency signal? */ + if (!is_vcpu_stopped(vcpu) + || (psw->mask & psw_int_mask) != psw_int_mask + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { + return __inject_sigp_emergency(vcpu, dst_vcpu); + } else { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INCORRECT_STATE; + return SIGP_CC_STATUS_STORED; + } +} + +static int __sigp_external_call(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, u64 *reg) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_INT_EXTERNAL_CALL, + .u.extcall.code = vcpu->vcpu_id, + }; + int rc; + + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_EXT_CALL_PENDING; + return SIGP_CC_STATUS_STORED; + } else if (rc == 0) { + VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", + dst_vcpu->vcpu_id); + } + + return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED; +} + +static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_STOP, + }; + int rc; + + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) + rc = SIGP_CC_BUSY; + else if (rc == 0) + VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", + dst_vcpu->vcpu_id); + + return rc; +} + +static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, u64 *reg) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_STOP, + .u.stop.flags = KVM_S390_STOP_FLAG_STORE_STATUS, + }; + int rc; + + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) + rc = SIGP_CC_BUSY; + else if (rc == 0) + VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x", + dst_vcpu->vcpu_id); + + return rc; +} + +static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, + u64 *status_reg) +{ + *status_reg &= 0xffffffff00000000UL; + + /* Reject set arch order, with czam we're always in z/Arch mode. */ + *status_reg |= SIGP_STATUS_INVALID_PARAMETER; + return SIGP_CC_STATUS_STORED; +} + +static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, + u32 address, u64 *reg) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_SET_PREFIX, + .u.prefix.address = address & 0x7fffe000u, + }; + int rc; + + /* + * Make sure the new value is valid memory. We only need to check the + * first page, since address is 8k aligned and memory pieces are always + * at least 1MB aligned and have at least a size of 1MB. + */ + if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_PARAMETER; + return SIGP_CC_STATUS_STORED; + } + + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INCORRECT_STATE; + return SIGP_CC_STATUS_STORED; + } + + return rc; +} + +static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, + u32 addr, u64 *reg) +{ + int rc; + + if (!kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INCORRECT_STATE; + return SIGP_CC_STATUS_STORED; + } + + addr &= 0x7ffffe00; + rc = kvm_s390_store_status_unloaded(dst_vcpu, addr); + if (rc == -EFAULT) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_PARAMETER; + rc = SIGP_CC_STATUS_STORED; + } + return rc; +} + +static int __sigp_sense_running(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, u64 *reg) +{ + int rc; + + if (!test_kvm_facility(vcpu->kvm, 9)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_ORDER; + return SIGP_CC_STATUS_STORED; + } + + if (kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_RUNNING)) { + /* running */ + rc = SIGP_CC_ORDER_CODE_ACCEPTED; + } else { + /* not running */ + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_NOT_RUNNING; + rc = SIGP_CC_STATUS_STORED; + } + + VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", + dst_vcpu->vcpu_id, rc); + + return rc; +} + +static int __prepare_sigp_re_start(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, u8 order_code) +{ + struct kvm_s390_local_interrupt *li = &dst_vcpu->arch.local_int; + /* handle (RE)START in user space */ + int rc = -EOPNOTSUPP; + + /* make sure we don't race with STOP irq injection */ + spin_lock(&li->lock); + if (kvm_s390_is_stop_irq_pending(dst_vcpu)) + rc = SIGP_CC_BUSY; + spin_unlock(&li->lock); + + return rc; +} + +static int __prepare_sigp_cpu_reset(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu, u8 order_code) +{ + /* handle (INITIAL) CPU RESET in user space */ + return -EOPNOTSUPP; +} + +static int __prepare_sigp_unknown(struct kvm_vcpu *vcpu, + struct kvm_vcpu *dst_vcpu) +{ + /* handle unknown orders in user space */ + return -EOPNOTSUPP; +} + +static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, + u16 cpu_addr, u32 parameter, u64 *status_reg) +{ + int rc; + struct kvm_vcpu *dst_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); + + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + + /* + * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders + * are processed asynchronously. Until the affected VCPU finishes + * its work and calls back into KVM to clear the (RESTART or STOP) + * interrupt, we need to return any new non-reset orders "busy". + * + * This is important because a single VCPU could issue: + * 1) SIGP STOP $DESTINATION + * 2) SIGP SENSE $DESTINATION + * + * If the SIGP SENSE would not be rejected as "busy", it could + * return an incorrect answer as to whether the VCPU is STOPPED + * or OPERATING. + */ + if (order_code != SIGP_INITIAL_CPU_RESET && + order_code != SIGP_CPU_RESET) { + /* + * Lockless check. Both SIGP STOP and SIGP (RE)START + * properly synchronize everything while processing + * their orders, while the guest cannot observe a + * difference when issuing other orders from two + * different VCPUs. + */ + if (kvm_s390_is_stop_irq_pending(dst_vcpu) || + kvm_s390_is_restart_irq_pending(dst_vcpu)) + return SIGP_CC_BUSY; + } + + switch (order_code) { + case SIGP_SENSE: + vcpu->stat.instruction_sigp_sense++; + rc = __sigp_sense(vcpu, dst_vcpu, status_reg); + break; + case SIGP_EXTERNAL_CALL: + vcpu->stat.instruction_sigp_external_call++; + rc = __sigp_external_call(vcpu, dst_vcpu, status_reg); + break; + case SIGP_EMERGENCY_SIGNAL: + vcpu->stat.instruction_sigp_emergency++; + rc = __sigp_emergency(vcpu, dst_vcpu); + break; + case SIGP_STOP: + vcpu->stat.instruction_sigp_stop++; + rc = __sigp_stop(vcpu, dst_vcpu); + break; + case SIGP_STOP_AND_STORE_STATUS: + vcpu->stat.instruction_sigp_stop_store_status++; + rc = __sigp_stop_and_store_status(vcpu, dst_vcpu, status_reg); + break; + case SIGP_STORE_STATUS_AT_ADDRESS: + vcpu->stat.instruction_sigp_store_status++; + rc = __sigp_store_status_at_addr(vcpu, dst_vcpu, parameter, + status_reg); + break; + case SIGP_SET_PREFIX: + vcpu->stat.instruction_sigp_prefix++; + rc = __sigp_set_prefix(vcpu, dst_vcpu, parameter, status_reg); + break; + case SIGP_COND_EMERGENCY_SIGNAL: + vcpu->stat.instruction_sigp_cond_emergency++; + rc = __sigp_conditional_emergency(vcpu, dst_vcpu, parameter, + status_reg); + break; + case SIGP_SENSE_RUNNING: + vcpu->stat.instruction_sigp_sense_running++; + rc = __sigp_sense_running(vcpu, dst_vcpu, status_reg); + break; + case SIGP_START: + vcpu->stat.instruction_sigp_start++; + rc = __prepare_sigp_re_start(vcpu, dst_vcpu, order_code); + break; + case SIGP_RESTART: + vcpu->stat.instruction_sigp_restart++; + rc = __prepare_sigp_re_start(vcpu, dst_vcpu, order_code); + break; + case SIGP_INITIAL_CPU_RESET: + vcpu->stat.instruction_sigp_init_cpu_reset++; + rc = __prepare_sigp_cpu_reset(vcpu, dst_vcpu, order_code); + break; + case SIGP_CPU_RESET: + vcpu->stat.instruction_sigp_cpu_reset++; + rc = __prepare_sigp_cpu_reset(vcpu, dst_vcpu, order_code); + break; + default: + vcpu->stat.instruction_sigp_unknown++; + rc = __prepare_sigp_unknown(vcpu, dst_vcpu); + } + + if (rc == -EOPNOTSUPP) + VCPU_EVENT(vcpu, 4, + "sigp order %u -> cpu %x: handled in user space", + order_code, dst_vcpu->vcpu_id); + + return rc; +} + +static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code, + u16 cpu_addr) +{ + if (!vcpu->kvm->arch.user_sigp) + return 0; + + switch (order_code) { + case SIGP_SENSE: + case SIGP_EXTERNAL_CALL: + case SIGP_EMERGENCY_SIGNAL: + case SIGP_COND_EMERGENCY_SIGNAL: + case SIGP_SENSE_RUNNING: + return 0; + /* update counters as we're directly dropping to user space */ + case SIGP_STOP: + vcpu->stat.instruction_sigp_stop++; + break; + case SIGP_STOP_AND_STORE_STATUS: + vcpu->stat.instruction_sigp_stop_store_status++; + break; + case SIGP_STORE_STATUS_AT_ADDRESS: + vcpu->stat.instruction_sigp_store_status++; + break; + case SIGP_STORE_ADDITIONAL_STATUS: + vcpu->stat.instruction_sigp_store_adtl_status++; + break; + case SIGP_SET_PREFIX: + vcpu->stat.instruction_sigp_prefix++; + break; + case SIGP_START: + vcpu->stat.instruction_sigp_start++; + break; + case SIGP_RESTART: + vcpu->stat.instruction_sigp_restart++; + break; + case SIGP_INITIAL_CPU_RESET: + vcpu->stat.instruction_sigp_init_cpu_reset++; + break; + case SIGP_CPU_RESET: + vcpu->stat.instruction_sigp_cpu_reset++; + break; + default: + vcpu->stat.instruction_sigp_unknown++; + } + VCPU_EVENT(vcpu, 3, "SIGP: order %u for CPU %d handled in userspace", + order_code, cpu_addr); + + return 1; +} + +int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) +{ + int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int r3 = vcpu->arch.sie_block->ipa & 0x000f; + u32 parameter; + u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; + u8 order_code; + int rc; + + /* sigp in userspace can exit */ + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); + if (handle_sigp_order_in_user_space(vcpu, order_code, cpu_addr)) + return -EOPNOTSUPP; + + if (r1 % 2) + parameter = vcpu->run->s.regs.gprs[r1]; + else + parameter = vcpu->run->s.regs.gprs[r1 + 1]; + + trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter); + switch (order_code) { + case SIGP_SET_ARCHITECTURE: + vcpu->stat.instruction_sigp_arch++; + rc = __sigp_set_arch(vcpu, parameter, + &vcpu->run->s.regs.gprs[r1]); + break; + default: + rc = handle_sigp_dst(vcpu, order_code, cpu_addr, + parameter, + &vcpu->run->s.regs.gprs[r1]); + } + + if (rc < 0) + return rc; + + kvm_s390_set_psw_cc(vcpu, rc); + return 0; +} + +/* + * Handle SIGP partial execution interception. + * + * This interception will occur at the source cpu when a source cpu sends an + * external call to a target cpu and the target cpu has the WAIT bit set in + * its cpuflags. Interception will occur after the interrupt indicator bits at + * the target cpu have been set. All error cases will lead to instruction + * interception, therefore nothing is to be checked or prepared. + */ +int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) +{ + int r3 = vcpu->arch.sie_block->ipa & 0x000f; + u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; + struct kvm_vcpu *dest_vcpu; + u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); + + if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); + BUG_ON(dest_vcpu == NULL); + + kvm_s390_vcpu_wakeup(dest_vcpu); + kvm_s390_set_psw_cc(vcpu, SIGP_CC_ORDER_CODE_ACCEPTED); + return 0; + } + + return -EOPNOTSUPP; +} diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h new file mode 100644 index 0000000000..6f0209d451 --- /dev/null +++ b/arch/s390/kvm/trace-s390.h @@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMS390_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm-s390 +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace-s390 + +/* + * The TRACE_SYSTEM_VAR defaults to TRACE_SYSTEM, but must be a + * legitimate C variable. It is not exported to user space. + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR kvm_s390 + +/* + * Trace point for the creation of the kvm instance. + */ +TRACE_EVENT(kvm_s390_create_vm, + TP_PROTO(unsigned long type), + TP_ARGS(type), + + TP_STRUCT__entry( + __field(unsigned long, type) + ), + + TP_fast_assign( + __entry->type = type; + ), + + TP_printk("create vm%s", + __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "") + ); + +/* + * Trace points for creation and destruction of vpcus. + */ +TRACE_EVENT(kvm_s390_create_vcpu, + TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu, + struct kvm_s390_sie_block *sie_block), + TP_ARGS(id, vcpu, sie_block), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(struct kvm_vcpu *, vcpu) + __field(struct kvm_s390_sie_block *, sie_block) + ), + + TP_fast_assign( + __entry->id = id; + __entry->vcpu = vcpu; + __entry->sie_block = sie_block; + ), + + TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK", + __entry->id, __entry->vcpu, __entry->sie_block) + ); + +TRACE_EVENT(kvm_s390_destroy_vcpu, + TP_PROTO(unsigned int id), + TP_ARGS(id), + + TP_STRUCT__entry( + __field(unsigned int, id) + ), + + TP_fast_assign( + __entry->id = id; + ), + + TP_printk("destroy cpu %d", __entry->id) + ); + +/* + * Trace point for start and stop of vpcus. + */ +TRACE_EVENT(kvm_s390_vcpu_start_stop, + TP_PROTO(unsigned int id, int state), + TP_ARGS(id, state), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(int, state) + ), + + TP_fast_assign( + __entry->id = id; + __entry->state = state; + ), + + TP_printk("%s cpu %d", __entry->state ? "starting" : "stopping", + __entry->id) + ); + +/* + * Trace points for injection of interrupts, either per machine or + * per vcpu. + */ + +#define kvm_s390_int_type \ + {KVM_S390_SIGP_STOP, "sigp stop"}, \ + {KVM_S390_PROGRAM_INT, "program interrupt"}, \ + {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ + {KVM_S390_RESTART, "sigp restart"}, \ + {KVM_S390_INT_PFAULT_INIT, "pfault init"}, \ + {KVM_S390_INT_PFAULT_DONE, "pfault done"}, \ + {KVM_S390_MCHK, "machine check"}, \ + {KVM_S390_INT_CLOCK_COMP, "clock comparator"}, \ + {KVM_S390_INT_CPU_TIMER, "cpu timer"}, \ + {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ + {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ + {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ + {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} + +#define get_irq_name(__type) \ + (__type > KVM_S390_INT_IO_MAX ? \ + __print_symbolic(__type, kvm_s390_int_type) : \ + (__type & KVM_S390_INT_IO_AI_MASK ? \ + "adapter I/O interrupt" : "subchannel I/O interrupt")) + +TRACE_EVENT(kvm_s390_inject_vm, + TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), + TP_ARGS(type, parm, parm64, who), + + TP_STRUCT__entry( + __field(__u32, inttype) + __field(__u32, parm) + __field(__u64, parm64) + __field(int, who) + ), + + TP_fast_assign( + __entry->inttype = type & 0x00000000ffffffff; + __entry->parm = parm; + __entry->parm64 = parm64; + __entry->who = who; + ), + + TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", + (__entry->who == 1) ? " (from kernel)" : + (__entry->who == 2) ? " (from user)" : "", + __entry->inttype, get_irq_name(__entry->inttype), + __entry->parm, __entry->parm64) + ); + +TRACE_EVENT(kvm_s390_inject_vcpu, + TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64), + TP_ARGS(id, type, parm, parm64), + + TP_STRUCT__entry( + __field(int, id) + __field(__u32, inttype) + __field(__u32, parm) + __field(__u64, parm64) + ), + + TP_fast_assign( + __entry->id = id; + __entry->inttype = type & 0x00000000ffffffff; + __entry->parm = parm; + __entry->parm64 = parm64; + ), + + TP_printk("inject (vcpu %d): type:%x (%s) parm:%x parm64:%llx", + __entry->id, __entry->inttype, + get_irq_name(__entry->inttype), __entry->parm, + __entry->parm64) + ); + +/* + * Trace point for the actual delivery of interrupts. + */ +TRACE_EVENT(kvm_s390_deliver_interrupt, + TP_PROTO(unsigned int id, __u64 type, __u64 data0, __u64 data1), + TP_ARGS(id, type, data0, data1), + + TP_STRUCT__entry( + __field(int, id) + __field(__u32, inttype) + __field(__u64, data0) + __field(__u64, data1) + ), + + TP_fast_assign( + __entry->id = id; + __entry->inttype = type & 0x00000000ffffffff; + __entry->data0 = data0; + __entry->data1 = data1; + ), + + TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ + "data:%08llx %016llx", + __entry->id, __entry->inttype, + get_irq_name(__entry->inttype), __entry->data0, + __entry->data1) + ); + +/* + * Trace point for resets that may be requested from userspace. + */ +TRACE_EVENT(kvm_s390_request_resets, + TP_PROTO(__u64 resets), + TP_ARGS(resets), + + TP_STRUCT__entry( + __field(__u64, resets) + ), + + TP_fast_assign( + __entry->resets = resets; + ), + + TP_printk("requesting userspace resets %llx", + __entry->resets) + ); + +/* + * Trace point for a vcpu's stop requests. + */ +TRACE_EVENT(kvm_s390_stop_request, + TP_PROTO(unsigned char stop_irq, unsigned char flags), + TP_ARGS(stop_irq, flags), + + TP_STRUCT__entry( + __field(unsigned char, stop_irq) + __field(unsigned char, flags) + ), + + TP_fast_assign( + __entry->stop_irq = stop_irq; + __entry->flags = flags; + ), + + TP_printk("stop request, stop irq = %u, flags = %08x", + __entry->stop_irq, __entry->flags) + ); + + +/* + * Trace point for enabling channel I/O instruction support. + */ +TRACE_EVENT(kvm_s390_enable_css, + TP_PROTO(void *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(void *, kvm) + ), + + TP_fast_assign( + __entry->kvm = kvm; + ), + + TP_printk("enabling channel I/O support (kvm @ %pK)\n", + __entry->kvm) + ); + +/* + * Trace point for enabling and disabling interlocking-and-broadcasting + * suppression. + */ +TRACE_EVENT(kvm_s390_enable_disable_ibs, + TP_PROTO(unsigned int id, int state), + TP_ARGS(id, state), + + TP_STRUCT__entry( + __field(unsigned int, id) + __field(int, state) + ), + + TP_fast_assign( + __entry->id = id; + __entry->state = state; + ), + + TP_printk("%s ibs on cpu %d", + __entry->state ? "enabling" : "disabling", __entry->id) + ); + +/* + * Trace point for modifying ais mode for a given isc. + */ +TRACE_EVENT(kvm_s390_modify_ais_mode, + TP_PROTO(__u8 isc, __u16 from, __u16 to), + TP_ARGS(isc, from, to), + + TP_STRUCT__entry( + __field(__u8, isc) + __field(__u16, from) + __field(__u16, to) + ), + + TP_fast_assign( + __entry->isc = isc; + __entry->from = from; + __entry->to = to; + ), + + TP_printk("for isc %x, modifying interruption mode from %s to %s", + __entry->isc, + (__entry->from == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->from == KVM_S390_AIS_MODE_SINGLE) ? + "Single-Interruption Mode" : "No-Interruptions Mode", + (__entry->to == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->to == KVM_S390_AIS_MODE_SINGLE) ? + "Single-Interruption Mode" : "No-Interruptions Mode") + ); + +/* + * Trace point for suppressed adapter I/O interrupt. + */ +TRACE_EVENT(kvm_s390_airq_suppressed, + TP_PROTO(__u32 id, __u8 isc), + TP_ARGS(id, isc), + + TP_STRUCT__entry( + __field(__u32, id) + __field(__u8, isc) + ), + + TP_fast_assign( + __entry->id = id; + __entry->isc = isc; + ), + + TP_printk("adapter I/O interrupt suppressed (id:%x isc:%x)", + __entry->id, __entry->isc) + ); + + +#endif /* _TRACE_KVMS390_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h new file mode 100644 index 0000000000..aa419eb6a0 --- /dev/null +++ b/arch/s390/kvm/trace.h @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* + * Helpers for vcpu-specific tracepoints containing the same information + * as s390dbf VCPU_EVENTs. + */ +#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu +#define VCPU_ARGS_COMMON vcpu +#define VCPU_FIELD_COMMON __field(int, id) \ + __field(unsigned long, pswmask) \ + __field(unsigned long, pswaddr) +#define VCPU_ASSIGN_COMMON do { \ + __entry->id = vcpu->vcpu_id; \ + __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \ + __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \ + } while (0); +#define VCPU_TP_PRINTK(p_str, p_args...) \ + TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ + __entry->pswmask, __entry->pswaddr, p_args) + +TRACE_EVENT(kvm_s390_skey_related_inst, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + ), + VCPU_TP_PRINTK("%s", "storage key related instruction") + ); + +TRACE_EVENT(kvm_s390_major_guest_pfault, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + ), + VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault") + ); + +TRACE_EVENT(kvm_s390_pfault_init, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token) + ); + +TRACE_EVENT(kvm_s390_pfault_done, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token) + ); + +/* + * Tracepoints for SIE entry and exit. + */ +TRACE_EVENT(kvm_s390_sie_enter, + TP_PROTO(VCPU_PROTO_COMMON, int cpuflags), + TP_ARGS(VCPU_ARGS_COMMON, cpuflags), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, cpuflags) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->cpuflags = cpuflags; + ), + + VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags) + ); + +TRACE_EVENT(kvm_s390_sie_fault, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + ), + + VCPU_TP_PRINTK("%s", "fault in sie instruction") + ); + +TRACE_EVENT(kvm_s390_sie_exit, + TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode), + TP_ARGS(VCPU_ARGS_COMMON, icptcode), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u8, icptcode) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->icptcode = icptcode; + ), + + VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode, + __print_symbolic(__entry->icptcode, + sie_intercept_code)) + ); + +/* + * Trace point for intercepted instructions. + */ +TRACE_EVENT(kvm_s390_intercept_instruction, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("intercepted instruction %016llx (%s)", + __entry->instruction, + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) + ); + +/* + * Trace point for intercepted program interruptions. + */ +TRACE_EVENT(kvm_s390_intercept_prog, + TP_PROTO(VCPU_PROTO_COMMON, __u16 code), + TP_ARGS(VCPU_ARGS_COMMON, code), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, code) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + ), + + VCPU_TP_PRINTK("intercepted program interruption %04x (%s)", + __entry->code, + __print_symbolic(__entry->code, + icpt_prog_codes)) + ); + +/* + * Trace point for validity intercepts. + */ +TRACE_EVENT(kvm_s390_intercept_validity, + TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy), + TP_ARGS(VCPU_ARGS_COMMON, viwhy), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, viwhy) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->viwhy = viwhy; + ), + + VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy) + ); + +/* + * Trace points for instructions that are of special interest. + */ + +TRACE_EVENT(kvm_s390_handle_sigp, + TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \ + __u32 parameter), + TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u8, order_code) + __field(__u16, cpu_addr) + __field(__u32, parameter) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->order_code = order_code; + __entry->cpu_addr = cpu_addr; + __entry->parameter = parameter; + ), + + VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \ + "parameter %08x", __entry->order_code, + __print_symbolic(__entry->order_code, + sigp_order_codes), + __entry->cpu_addr, __entry->parameter) + ); + +TRACE_EVENT(kvm_s390_handle_sigp_pei, + TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr), + TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u8, order_code) + __field(__u16, cpu_addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->order_code = order_code; + __entry->cpu_addr = cpu_addr; + ), + + VCPU_TP_PRINTK("handle sigp pei order %02x (%s), cpu address %04x", + __entry->order_code, + __print_symbolic(__entry->order_code, + sigp_order_codes), + __entry->cpu_addr) + ); + +TRACE_EVENT(kvm_s390_handle_diag, + TP_PROTO(VCPU_PROTO_COMMON, __u16 code), + TP_ARGS(VCPU_ARGS_COMMON, code), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u16, code) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + ), + + VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code, + __print_symbolic(__entry->code, diagnose_codes)) + ); + +TRACE_EVENT(kvm_s390_handle_lctl, + TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, g) + __field(int, reg1) + __field(int, reg3) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->g = g; + __entry->reg1 = reg1; + __entry->reg3 = reg3; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx", + __entry->g ? "lctlg" : "lctl", + __entry->reg1, __entry->reg3, __entry->addr) + ); + +TRACE_EVENT(kvm_s390_handle_stctl, + TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, g) + __field(int, reg1) + __field(int, reg3) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->g = g; + __entry->reg1 = reg1; + __entry->reg3 = reg3; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("%s: storing cr %x-%x to %016llx", + __entry->g ? "stctg" : "stctl", + __entry->reg1, __entry->reg3, __entry->addr) + ); + +TRACE_EVENT(kvm_s390_handle_prefix, + TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address), + TP_ARGS(VCPU_ARGS_COMMON, set, address), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, set) + __field(u32, address) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->set = set; + __entry->address = address; + ), + + VCPU_TP_PRINTK("%s prefix to %08x", + __entry->set ? "setting" : "storing", + __entry->address) + ); + +TRACE_EVENT(kvm_s390_handle_stap, + TP_PROTO(VCPU_PROTO_COMMON, u64 address), + TP_ARGS(VCPU_ARGS_COMMON, address), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, address) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->address = address; + ), + + VCPU_TP_PRINTK("storing cpu address to %016llx", + __entry->address) + ); + +TRACE_EVENT(kvm_s390_handle_stfl, + TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list), + TP_ARGS(VCPU_ARGS_COMMON, facility_list), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(unsigned int, facility_list) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->facility_list = facility_list; + ), + + VCPU_TP_PRINTK("store facility list value %08x", + __entry->facility_list) + ); + +TRACE_EVENT(kvm_s390_handle_stsi, + TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(int, fc) + __field(int, sel1) + __field(int, sel2) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->fc = fc; + __entry->sel1 = sel1; + __entry->sel2 = sel2; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx", + __entry->fc, __entry->sel1, __entry->sel2, + __entry->addr) + ); + +TRACE_EVENT(kvm_s390_handle_operexc, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)", + __entry->instruction, + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) + ); + +TRACE_EVENT(kvm_s390_handle_sthyi, + TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, code, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, code) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx", + __entry->code, __entry->addr) + ); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 0000000000..e55f489e1f --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,1488 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016, 2018 + * + * Author(s): David Hildenbrand + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* + * the backup info for machine check. ensure it's at + * the same offset as that in struct sie_page! + */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ + /* + * The pinned original scb. Be aware that other VCPUs can modify + * it while we read from it. Values that are used for conditions or + * are reused conditionally, should be accessed via READ_ONCE. + */ + struct kvm_s390_sie_block *scb_o; /* 0x0218 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0220 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0228 */ + /* calculated guest addresses of satellite control blocks */ + gpa_t sca_gpa; /* 0x0230 */ + gpa_t itdba_gpa; /* 0x0238 */ + gpa_t gvrd_gpa; /* 0x0240 */ + gpa_t riccbd_gpa; /* 0x0248 */ + gpa_t sdnx_gpa; /* 0x0250 */ + __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ +}; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS)) + newflags |= cpuflags & CPUSTAT_KSS; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} +/* Copy to APCB FORMAT1 from APCB FORMAT0 */ +static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) +{ + struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; + apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL; + apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL; + + return 0; + +} + +/** + * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @crycb_gpa: guest physical address to start of original guest crycb + * @apcb_h: pointer to start of apcb in the guest1 + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long crycb_gpa, unsigned long *apcb_h) +{ + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, + sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); + + return 0; +} + +/** + * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @crycb_gpa: guest physical address to start of original guest crycb + * @apcb_h: pointer to start of apcb in the host + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long crycb_gpa, + unsigned long *apcb_h) +{ + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, + sizeof(struct kvm_s390_apcb1))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); + + return 0; +} + +/** + * setup_apcb - Create a shadow copy of the apcb. + * @vcpu: pointer to the virtual CPU + * @crycb_s: pointer to shadow crycb + * @crycb_gpa: guest physical address of original guest crycb + * @crycb_h: pointer to the host crycb + * @fmt_o: format of the original guest crycb. + * @fmt_h: format of the host crycb. + * + * Checks the compatibility between the guest and host crycb and calls the + * appropriate copy function. + * + * Return 0 or an error number if the guest and host crycb are incompatible. + */ +static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, + const u32 crycb_gpa, + struct kvm_s390_crypto_cb *crycb_h, + int fmt_o, int fmt_h) +{ + switch (fmt_o) { + case CRYCB_FORMAT2: + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) + return -EACCES; + if (fmt_h != CRYCB_FORMAT2) + return -EINVAL; + return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, + crycb_gpa, + (unsigned long *)&crycb_h->apcb1); + case CRYCB_FORMAT1: + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + crycb_gpa, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + crycb_gpa, + (unsigned long *) &crycb_h->apcb0); + } + break; + case CRYCB_FORMAT0: + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) + return -EACCES; + + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + crycb_gpa, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + crycb_gpa, + (unsigned long *) &crycb_h->apcb0); + } + } + return -EINVAL; +} + +/** + * shadow_crycb - Create a shadow copy of the crycb block + * @vcpu: a pointer to the virtual CPU + * @vsie_page: a pointer to internal date used for the vSIE + * + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We accept format-1 or format-2, but we convert format-1 into format-2 + * in the shadow CRYCB. + * Using format-2 enables the firmware to choose the right format when + * scheduling the SIE. + * There is nothing to do for format-0. + * + * This function centralize the issuing of set_validity_icpt() for all + * the subfunctions working on the crycb. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd); + const u32 crycb_addr = crycbd_o & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + u32 ecd_flags; + int apie_h; + int apie_s; + int key_msk = test_kvm_facility(vcpu->kvm, 76); + int fmt_o = crycbd_o & CRYCB_FORMAT_MASK; + int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK; + int ret = 0; + + scb_s->crycbd = 0; + + apie_h = vcpu->arch.sie_block->eca & ECA_APIE; + apie_s = apie_h & scb_o->eca; + if (!apie_s && (!key_msk || (fmt_o == CRYCB_FORMAT0))) + return 0; + + if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + if (fmt_o == CRYCB_FORMAT1) + if ((crycb_addr & PAGE_MASK) != + ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + + if (apie_s) { + ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr, + vcpu->kvm->arch.crypto.crycb, + fmt_o, fmt_h); + if (ret) + goto end; + scb_s->eca |= scb_o->eca & ECA_APIE; + } + + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC; + if (!ecb3_flags && !ecd_flags) + goto end; + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, + vsie_page->crycb.dea_wrapping_key_mask, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->ecd |= ecd_flags; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); +end: + switch (ret) { + case -EINVAL: + return set_validity_icpt(scb_s, 0x0022U); + case -EFAULT: + return set_validity_icpt(scb_s, 0x0035U); + case -EACCES: + return set_validity_icpt(scb_s, 0x003CU); + } + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2; + return 0; +} + +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + /* READ_ONCE does not work on bitfields - use a temporary variable */ + const uint32_t __new_ibc = scb_o->ibc; + const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && new_ibc) { + scb_s->ibc = new_ibc; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) { + scb_o->fpf &= ~FPF_BPBC; + scb_o->fpf |= scb_s->fpf & FPF_BPBC; + } + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + /* READ_ONCE does not work on bitfields - use a temporary variable */ + const uint32_t __new_prefix = scb_o->prefix; + const uint32_t new_prefix = READ_ONCE(__new_prefix); + const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE; + bool had_tx = scb_s->ecb & ECB_TE; + unsigned long new_mso = 0; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + scb_s->fac = 0; + scb_s->fpf = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS)) + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + + scb_s->icpua = scb_o->icpua; + + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != new_prefix) + prefix_unmapped(vsie_page); + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = new_mso; + scb_s->prefix = new_prefix; + + /* We have to definitely flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI); + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { + /* remap the prefix is tx is toggled on */ + if (!had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= ECB_TE; + } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) + scb_s->fpf |= scb_o->fpf & FPF_BPBC; + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & ECA_VX; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; + } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI; + /* Instruction Execution Prevention */ + if (test_kvm_facility(vcpu->kvm, 130)) + scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP; + /* Guarded Storage */ + if (test_kvm_facility(vcpu->kvm, 133)) { + scb_s->ecb |= scb_o->ecb & ECB_GS; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & ECA_SII; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & ECA_IB; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & ECA_CEI; + /* Epoch Extension */ + if (test_kvm_facility(vcpu->kvm, 139)) { + scb_s->ecd |= scb_o->ecd & ECD_MEF; + scb_s->epdx = scb_o->epdx; + } + + /* etoken */ + if (test_kvm_facility(vcpu->kvm, 156)) + scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; + + scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; + + prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page and if tx is enabled also the second prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 on if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + if (prefix_is_mapped(vsie_page)) + return 0; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); + if (!rc && (scb_s->ecb & ECB_TE)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE, NULL); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + + page = gfn_to_page(kvm, gpa_to_gfn(gpa)); + if (is_error_page(page)) + return -EINVAL; + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa); + vsie_page->sca_gpa = 0; + scb_s->scaol = 0; + scb_s->scaoh = 0; + } + + hpa = scb_s->itdba; + if (hpa) { + unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa); + vsie_page->itdba_gpa = 0; + scb_s->itdba = 0; + } + + hpa = scb_s->gvrd; + if (hpa) { + unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa); + vsie_page->gvrd_gpa = 0; + scb_s->gvrd = 0; + } + + hpa = scb_s->riccbd; + if (hpa) { + unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa); + vsie_page->riccbd_gpa = 0; + scb_s->riccbd = 0; + } + + hpa = scb_s->sdnxo; + if (hpa) { + unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa); + vsie_page->sdnx_gpa = 0; + scb_s->sdnxo = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = READ_ONCE(scb_o->scaol) & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32; + if (gpa) { + if (gpa < 2 * PAGE_SIZE) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + vsie_page->sca_gpa = gpa; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + + gpa = READ_ONCE(scb_o->itdba) & ~0xffUL; + if (gpa && (scb_s->ecb & ECB_TE)) { + if (gpa < 2 * PAGE_SIZE) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + vsie_page->itdba_gpa = gpa; + scb_s->itdba = hpa; + } + + gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL; + if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { + if (gpa < 2 * PAGE_SIZE) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + vsie_page->gvrd_gpa = gpa; + scb_s->gvrd = hpa; + } + + gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL; + if (gpa && (scb_s->ecb3 & ECB3_RI)) { + if (gpa < 2 * PAGE_SIZE) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* Validity 0x0044 will be checked by SIE */ + vsie_page->riccbd_gpa = gpa; + scb_s->riccbd = hpa; + } + if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) || + (scb_s->ecd & ECD_ETOKENF)) { + unsigned long sdnxc; + + gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL; + sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL; + if (!gpa || gpa < 2 * PAGE_SIZE) { + rc = set_validity_icpt(scb_s, 0x10b0U); + goto unpin; + } + if (sdnxc < 6 || sdnxc > 12) { + rc = set_validity_icpt(scb_s, 0x10b1U); + goto unpin; + } + if (gpa & ((1 << sdnxc) - 1)) { + rc = set_validity_icpt(scb_s, 0x10b2U); + goto unpin; + } + /* Due to alignment rules (checked above) this cannot + * cross page boundaries + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) { + rc = set_validity_icpt(scb_s, 0x10b0U); + goto unpin; + } + vsie_page->sdnx_gpa = gpa; + scb_s->sdnxo = hpa | sdnxc; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + WARN_ON_ONCE(rc); + return 1; + } + vsie_page->scb_o = phys_to_virt(hpa); + return 0; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (always primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr, NULL); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; + } + return rc; +} + +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling. + */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr, NULL); + vsie_page->fault_addr = 0; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + +/* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) + __releases(vcpu->kvm->srcu) + __acquires(vcpu->kvm->srcu) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int guest_bp_isolation; + int rc = 0; + + handle_last_fault(vcpu, vsie_page); + + kvm_vcpu_srcu_read_unlock(vcpu); + + /* save current guest state of bp isolation override */ + guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); + + /* + * The guest is running with BPBC, so we have to force it on for our + * nested guest. This is done by enabling BPBC globally, so the BPBC + * control in the SCB (which the nested guest can modify) is simply + * ignored. + */ + if (test_kvm_facility(vcpu->kvm, 82) && + vcpu->arch.sie_block->fpf & FPF_BPBC) + set_thread_flag(TIF_ISOLATE_BP_GUEST); + + local_irq_disable(); + guest_enter_irqoff(); + local_irq_enable(); + + /* + * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking + * and VCPU requests also hinder the vSIE from running and lead + * to an immediate exit. kvm_s390_vsie_kick() has to be used to + * also kick the vSIE. + */ + vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; + barrier(); + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); + if (!kvm_s390_vcpu_sie_inhibited(vcpu)) + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + barrier(); + vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; + + local_irq_disable(); + guest_exit_irqoff(); + local_irq_enable(); + + /* restore guest state for bp isolation override */ + if (!guest_bp_isolation) + clear_thread_flag(TIF_ISOLATE_BP_GUEST); + + kvm_vcpu_srcu_read_lock(vcpu); + + if (rc == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info); + return 0; + } + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + + if (scb_s->ecd & ECD_MEF) { + scb_s->epdx += vcpu->kvm->arch.epdx; + if (scb_s->epoch < vcpu->kvm->arch.epoch) + scb_s->epdx += 1; + } + + preempt_enable(); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) + break; + cond_resched(); + } + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presentes as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + rc = 1; + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel. + */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); + rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. */ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct vsie_page *vsie_page; + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile new file mode 100644 index 0000000000..7c50eca85c --- /dev/null +++ b/arch/s390/lib/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for s390-specific library files.. +# + +lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o +obj-y += mem.o xor.o +lib-$(CONFIG_KPROBES) += probes.o +lib-$(CONFIG_UPROBES) += probes.o +obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o +test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o + +# Instrumenting memory accesses to __user data (in different address space) +# produce false positives +KASAN_SANITIZE_uaccess.o := n + +obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o +CFLAGS_test_unwind.o += -fno-optimize-sibling-calls + +obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o +obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o + +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c new file mode 100644 index 0000000000..be14c58cb9 --- /dev/null +++ b/arch/s390/lib/delay.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Precise Delay Loops for S390 + * + * Copyright IBM Corp. 1999, 2008 + * Author(s): Martin Schwidefsky , + */ + +#include +#include +#include +#include + +void __delay(unsigned long loops) +{ + /* + * Loop 'loops' times. Callers must not assume a specific + * amount of time passes before this function returns. + */ + asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); +} +EXPORT_SYMBOL(__delay); + +static void delay_loop(unsigned long delta) +{ + unsigned long end; + + end = get_tod_clock_monotonic() + delta; + while (!tod_after(get_tod_clock_monotonic(), end)) + cpu_relax(); +} + +void __udelay(unsigned long usecs) +{ + delay_loop(usecs << 12); +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + nsecs <<= 9; + do_div(nsecs, 125); + delay_loop(nsecs); +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c new file mode 100644 index 0000000000..8c9d4da87e --- /dev/null +++ b/arch/s390/lib/error-inject.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include +#include +#include + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some + * kernel function. + */ + regs->psw.addr = regs->gprs[14]; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile new file mode 100644 index 0000000000..854631d9cb --- /dev/null +++ b/arch/s390/lib/expoline/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += expoline.o diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline/expoline.S new file mode 100644 index 0000000000..92ed8409a7 --- /dev/null +++ b/arch/s390/lib/expoline/expoline.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +.macro GEN_ALL_BR_THUNK_EXTERN + .irp r1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + GEN_BR_THUNK_EXTERN %r\r1 + .endr +.endm + +GEN_ALL_BR_THUNK_EXTERN diff --git a/arch/s390/lib/find.c b/arch/s390/lib/find.c new file mode 100644 index 0000000000..96a8a2e2d0 --- /dev/null +++ b/arch/s390/lib/find.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MSB0 numbered special bitops handling. + * + * The bits are numbered: + * |0..............63|64............127|128...........191|192...........255| + * + * The reason for this bit numbering is the fact that the hardware sets bits + * in a bitmap starting at bit 0 (MSB) and we don't want to scan the bitmap + * from the 'wrong end'. + */ + +#include +#include +#include + +unsigned long find_first_bit_inv(const unsigned long *addr, unsigned long size) +{ + const unsigned long *p = addr; + unsigned long result = 0; + unsigned long tmp; + + while (size & ~(BITS_PER_LONG - 1)) { + if ((tmp = *(p++))) + goto found; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = (*p) & (~0UL << (BITS_PER_LONG - size)); + if (!tmp) /* Are any bits set? */ + return result + size; /* Nope. */ +found: + return result + (__fls(tmp) ^ (BITS_PER_LONG - 1)); +} +EXPORT_SYMBOL(find_first_bit_inv); + +unsigned long find_next_bit_inv(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + (offset / BITS_PER_LONG); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL >> offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (!tmp) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + (__fls(tmp) ^ (BITS_PER_LONG - 1)); +} +EXPORT_SYMBOL(find_next_bit_inv); diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S new file mode 100644 index 0000000000..08f60a42b9 --- /dev/null +++ b/arch/s390/lib/mem.S @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * String handling functions. + * + * Copyright IBM Corp. 2012 + */ + +#include +#include +#include + + GEN_BR_THUNK %r14 + +/* + * void *memmove(void *dest, const void *src, size_t n) + */ +SYM_FUNC_START(__memmove) + ltgr %r4,%r4 + lgr %r1,%r2 + jz .Lmemmove_exit + aghi %r4,-1 + clgr %r2,%r3 + jnh .Lmemmove_forward + la %r5,1(%r4,%r3) + clgr %r2,%r5 + jl .Lmemmove_reverse +.Lmemmove_forward: + srlg %r0,%r4,8 + ltgr %r0,%r0 + jz .Lmemmove_forward_remainder +.Lmemmove_forward_loop: + mvc 0(256,%r1),0(%r3) + la %r1,256(%r1) + la %r3,256(%r3) + brctg %r0,.Lmemmove_forward_loop +.Lmemmove_forward_remainder: + larl %r5,.Lmemmove_mvc + ex %r4,0(%r5) +.Lmemmove_exit: + BR_EX %r14 +.Lmemmove_reverse: + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + brctg %r4,.Lmemmove_reverse + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + BR_EX %r14 +.Lmemmove_mvc: + mvc 0(1,%r1),0(%r3) +SYM_FUNC_END(__memmove) +EXPORT_SYMBOL(__memmove) + +SYM_FUNC_ALIAS(memmove, __memmove) +EXPORT_SYMBOL(memmove) + +/* + * memset implementation + * + * This code corresponds to the C construct below. We do distinguish + * between clearing (c == 0) and setting a memory array (c != 0) simply + * because nearly all memset invocations in the kernel clear memory and + * the xc instruction is preferred in such cases. + * + * void *memset(void *s, int c, size_t n) + * { + * if (likely(c == 0)) + * return __builtin_memset(s, 0, n); + * return __builtin_memset(s, c, n); + * } + */ +SYM_FUNC_START(__memset) + ltgr %r4,%r4 + jz .Lmemset_exit + ltgr %r3,%r3 + jnz .Lmemset_fill + aghi %r4,-1 + srlg %r3,%r4,8 + ltgr %r3,%r3 + lgr %r1,%r2 + jz .Lmemset_clear_remainder +.Lmemset_clear_loop: + xc 0(256,%r1),0(%r1) + la %r1,256(%r1) + brctg %r3,.Lmemset_clear_loop +.Lmemset_clear_remainder: + larl %r3,.Lmemset_xc + ex %r4,0(%r3) +.Lmemset_exit: + BR_EX %r14 +.Lmemset_fill: + cghi %r4,1 + lgr %r1,%r2 + je .Lmemset_fill_exit + aghi %r4,-2 + srlg %r5,%r4,8 + ltgr %r5,%r5 + jz .Lmemset_fill_remainder +.Lmemset_fill_loop: + stc %r3,0(%r1) + mvc 1(255,%r1),0(%r1) + la %r1,256(%r1) + brctg %r5,.Lmemset_fill_loop +.Lmemset_fill_remainder: + stc %r3,0(%r1) + larl %r5,.Lmemset_mvc + ex %r4,0(%r5) + BR_EX %r14 +.Lmemset_fill_exit: + stc %r3,0(%r1) + BR_EX %r14 +.Lmemset_xc: + xc 0(1,%r1),0(%r1) +.Lmemset_mvc: + mvc 1(1,%r1),0(%r1) +SYM_FUNC_END(__memset) +EXPORT_SYMBOL(__memset) + +SYM_FUNC_ALIAS(memset, __memset) +EXPORT_SYMBOL(memset) + +/* + * memcpy implementation + * + * void *memcpy(void *dest, const void *src, size_t n) + */ +SYM_FUNC_START(__memcpy) + ltgr %r4,%r4 + jz .Lmemcpy_exit + aghi %r4,-1 + srlg %r5,%r4,8 + ltgr %r5,%r5 + lgr %r1,%r2 + jnz .Lmemcpy_loop +.Lmemcpy_remainder: + larl %r5,.Lmemcpy_mvc + ex %r4,0(%r5) +.Lmemcpy_exit: + BR_EX %r14 +.Lmemcpy_loop: + mvc 0(256,%r1),0(%r3) + la %r1,256(%r1) + la %r3,256(%r3) + brctg %r5,.Lmemcpy_loop + j .Lmemcpy_remainder +.Lmemcpy_mvc: + mvc 0(1,%r1),0(%r3) +SYM_FUNC_END(__memcpy) +EXPORT_SYMBOL(__memcpy) + +SYM_FUNC_ALIAS(memcpy, __memcpy) +EXPORT_SYMBOL(memcpy) + +/* + * __memset16/32/64 + * + * void *__memset16(uint16_t *s, uint16_t v, size_t count) + * void *__memset32(uint32_t *s, uint32_t v, size_t count) + * void *__memset64(uint64_t *s, uint64_t v, size_t count) + */ +.macro __MEMSET bits,bytes,insn +SYM_FUNC_START(__memset\bits) + ltgr %r4,%r4 + jz .L__memset_exit\bits + cghi %r4,\bytes + je .L__memset_store\bits + aghi %r4,-(\bytes+1) + srlg %r5,%r4,8 + ltgr %r5,%r5 + lgr %r1,%r2 + jz .L__memset_remainder\bits +.L__memset_loop\bits: + \insn %r3,0(%r1) + mvc \bytes(256-\bytes,%r1),0(%r1) + la %r1,256(%r1) + brctg %r5,.L__memset_loop\bits +.L__memset_remainder\bits: + \insn %r3,0(%r1) + larl %r5,.L__memset_mvc\bits + ex %r4,0(%r5) + BR_EX %r14 +.L__memset_store\bits: + \insn %r3,0(%r2) +.L__memset_exit\bits: + BR_EX %r14 +.L__memset_mvc\bits: + mvc \bytes(1,%r1),0(%r1) +SYM_FUNC_END(__memset\bits) +.endm + +__MEMSET 16,2,sth +EXPORT_SYMBOL(__memset16) + +__MEMSET 32,4,st +EXPORT_SYMBOL(__memset32) + +__MEMSET 64,8,stg +EXPORT_SYMBOL(__memset64) diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c new file mode 100644 index 0000000000..1e184a0344 --- /dev/null +++ b/arch/s390/lib/probes.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common helper functions for kprobes and uprobes + * + * Copyright IBM Corp. 2014 + */ + +#include +#include +#include + +int probe_is_prohibited_opcode(u16 *insn) +{ + if (!is_known_insn((unsigned char *)insn)) + return -EINVAL; + switch (insn[0] >> 8) { + case 0x0c: /* bassm */ + case 0x0b: /* bsm */ + case 0x83: /* diag */ + case 0x44: /* ex */ + case 0xac: /* stnsm */ + case 0xad: /* stosm */ + return -EINVAL; + case 0xc6: + switch (insn[0] & 0x0f) { + case 0x00: /* exrl */ + return -EINVAL; + } + } + switch (insn[0]) { + case 0x0101: /* pr */ + case 0xb25a: /* bsa */ + case 0xb240: /* bakr */ + case 0xb258: /* bsg */ + case 0xb218: /* pc */ + case 0xb228: /* pt */ + case 0xb98d: /* epsw */ + case 0xe560: /* tbegin */ + case 0xe561: /* tbeginc */ + case 0xb2f8: /* tend */ + return -EINVAL; + } + return 0; +} + +int probe_get_fixup_type(u16 *insn) +{ + /* default fixup method */ + int fixup = FIXUP_PSW_NORMAL; + + switch (insn[0] >> 8) { + case 0x05: /* balr */ + case 0x0d: /* basr */ + fixup = FIXUP_RETURN_REGISTER; + /* if r2 = 0, no branch will be taken */ + if ((insn[0] & 0x0f) == 0) + fixup |= FIXUP_BRANCH_NOT_TAKEN; + break; + case 0x06: /* bctr */ + case 0x07: /* bcr */ + fixup = FIXUP_BRANCH_NOT_TAKEN; + break; + case 0x45: /* bal */ + case 0x4d: /* bas */ + fixup = FIXUP_RETURN_REGISTER; + break; + case 0x47: /* bc */ + case 0x46: /* bct */ + case 0x86: /* bxh */ + case 0x87: /* bxle */ + fixup = FIXUP_BRANCH_NOT_TAKEN; + break; + case 0x82: /* lpsw */ + fixup = FIXUP_NOT_REQUIRED; + break; + case 0xb2: /* lpswe */ + if ((insn[0] & 0xff) == 0xb2) + fixup = FIXUP_NOT_REQUIRED; + break; + case 0xa7: /* bras */ + if ((insn[0] & 0x0f) == 0x05) + fixup |= FIXUP_RETURN_REGISTER; + break; + case 0xc0: + if ((insn[0] & 0x0f) == 0x05) /* brasl */ + fixup |= FIXUP_RETURN_REGISTER; + break; + case 0xeb: + switch (insn[2] & 0xff) { + case 0x44: /* bxhg */ + case 0x45: /* bxleg */ + fixup = FIXUP_BRANCH_NOT_TAKEN; + break; + } + break; + case 0xe3: /* bctg */ + if ((insn[2] & 0xff) == 0x46) + fixup = FIXUP_BRANCH_NOT_TAKEN; + break; + case 0xec: + switch (insn[2] & 0xff) { + case 0xe5: /* clgrb */ + case 0xe6: /* cgrb */ + case 0xf6: /* crb */ + case 0xf7: /* clrb */ + case 0xfc: /* cgib */ + case 0xfd: /* cglib */ + case 0xfe: /* cib */ + case 0xff: /* clib */ + fixup = FIXUP_BRANCH_NOT_TAKEN; + break; + } + break; + } + return fixup; +} + +int probe_is_insn_relative_long(u16 *insn) +{ + /* Check if we have a RIL-b or RIL-c format instruction which + * we need to modify in order to avoid instruction emulation. */ + switch (insn[0] >> 8) { + case 0xc0: + if ((insn[0] & 0x0f) == 0x00) /* larl */ + return true; + break; + case 0xc4: + switch (insn[0] & 0x0f) { + case 0x02: /* llhrl */ + case 0x04: /* lghrl */ + case 0x05: /* lhrl */ + case 0x06: /* llghrl */ + case 0x07: /* sthrl */ + case 0x08: /* lgrl */ + case 0x0b: /* stgrl */ + case 0x0c: /* lgfrl */ + case 0x0d: /* lrl */ + case 0x0e: /* llgfrl */ + case 0x0f: /* strl */ + return true; + } + break; + case 0xc6: + switch (insn[0] & 0x0f) { + case 0x02: /* pfdrl */ + case 0x04: /* cghrl */ + case 0x05: /* chrl */ + case 0x06: /* clghrl */ + case 0x07: /* clhrl */ + case 0x08: /* cgrl */ + case 0x0a: /* clgrl */ + case 0x0c: /* cgfrl */ + case 0x0d: /* crl */ + case 0x0e: /* clgfrl */ + case 0x0f: /* clrl */ + return true; + } + break; + } + return false; +} diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c new file mode 100644 index 0000000000..81c53440b3 --- /dev/null +++ b/arch/s390/lib/spinlock.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Out of line spinlock code. + * + * Copyright IBM Corp. 2004, 2006 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int spin_retry = -1; + +static int __init spin_retry_init(void) +{ + if (spin_retry < 0) + spin_retry = 1000; + return 0; +} +early_initcall(spin_retry_init); + +/* + * spin_retry= parameter + */ +static int __init spin_retry_setup(char *str) +{ + spin_retry = simple_strtoul(str, &str, 0); + return 1; +} +__setup("spin_retry=", spin_retry_setup); + +struct spin_wait { + struct spin_wait *next, *prev; + int node_id; +} __aligned(32); + +static DEFINE_PER_CPU_ALIGNED(struct spin_wait, spin_wait[4]); + +#define _Q_LOCK_CPU_OFFSET 0 +#define _Q_LOCK_STEAL_OFFSET 16 +#define _Q_TAIL_IDX_OFFSET 18 +#define _Q_TAIL_CPU_OFFSET 20 + +#define _Q_LOCK_CPU_MASK 0x0000ffff +#define _Q_LOCK_STEAL_ADD 0x00010000 +#define _Q_LOCK_STEAL_MASK 0x00030000 +#define _Q_TAIL_IDX_MASK 0x000c0000 +#define _Q_TAIL_CPU_MASK 0xfff00000 + +#define _Q_LOCK_MASK (_Q_LOCK_CPU_MASK | _Q_LOCK_STEAL_MASK) +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +void arch_spin_lock_setup(int cpu) +{ + struct spin_wait *node; + int ix; + + node = per_cpu_ptr(&spin_wait[0], cpu); + for (ix = 0; ix < 4; ix++, node++) { + memset(node, 0, sizeof(*node)); + node->node_id = ((cpu + 1) << _Q_TAIL_CPU_OFFSET) + + (ix << _Q_TAIL_IDX_OFFSET); + } +} + +static inline int arch_load_niai4(int *lock) +{ + int owner; + + asm_inline volatile( + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ + " l %0,%1\n" + : "=d" (owner) : "Q" (*lock) : "memory"); + return owner; +} + +static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +{ + int expected = old; + + asm_inline volatile( + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ + " cs %0,%3,%1\n" + : "=d" (old), "=Q" (*lock) + : "0" (old), "d" (new), "Q" (*lock) + : "cc", "memory"); + return expected == old; +} + +static inline struct spin_wait *arch_spin_decode_tail(int lock) +{ + int ix, cpu; + + ix = (lock & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + cpu = (lock & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET; + return per_cpu_ptr(&spin_wait[ix], cpu - 1); +} + +static inline int arch_spin_yield_target(int lock, struct spin_wait *node) +{ + if (lock & _Q_LOCK_CPU_MASK) + return lock & _Q_LOCK_CPU_MASK; + if (node == NULL || node->prev == NULL) + return 0; /* 0 -> no target cpu */ + while (node->prev) + node = node->prev; + return node->node_id >> _Q_TAIL_CPU_OFFSET; +} + +static inline void arch_spin_lock_queued(arch_spinlock_t *lp) +{ + struct spin_wait *node, *next; + int lockval, ix, node_id, tail_id, old, new, owner, count; + + ix = S390_lowcore.spinlock_index++; + barrier(); + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + node = this_cpu_ptr(&spin_wait[ix]); + node->prev = node->next = NULL; + node_id = node->node_id; + + /* Enqueue the node for this CPU in the spinlock wait queue */ + while (1) { + old = READ_ONCE(lp->lock); + if ((old & _Q_LOCK_CPU_MASK) == 0 && + (old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) { + /* + * The lock is free but there may be waiters. + * With no waiters simply take the lock, if there + * are waiters try to steal the lock. The lock may + * be stolen three times before the next queued + * waiter will get the lock. + */ + new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + goto out; + /* lock passing in progress */ + continue; + } + /* Make the node of this CPU the new tail. */ + new = node_id | (old & _Q_LOCK_MASK); + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + break; + } + /* Set the 'next' pointer of the tail node in the queue */ + tail_id = old & _Q_TAIL_MASK; + if (tail_id != 0) { + node->prev = arch_spin_decode_tail(tail_id); + WRITE_ONCE(node->prev->next, node); + } + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + /* Spin on the CPU local node->prev pointer */ + if (tail_id != 0) { + count = spin_retry; + while (READ_ONCE(node->prev) != NULL) { + if (count-- >= 0) + continue; + count = spin_retry; + /* Query running state of lock holder again. */ + owner = arch_spin_yield_target(old, node); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + } + + /* Spin on the lock value in the spinlock_t */ + count = spin_retry; + while (1) { + old = READ_ONCE(lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + if (!owner) { + tail_id = old & _Q_TAIL_MASK; + new = ((tail_id != node_id) ? tail_id : 0) | lockval; + if (__atomic_cmpxchg_bool(&lp->lock, old, new)) + /* Got the lock */ + break; + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } + + /* Pass lock_spin job to next CPU in the queue */ + if (node_id && tail_id != node_id) { + /* Wait until the next CPU has set up the 'next' pointer */ + while ((next = READ_ONCE(node->next)) == NULL) + ; + next->prev = NULL; + } + + out: + S390_lowcore.spinlock_index--; +} + +static inline void arch_spin_lock_classic(arch_spinlock_t *lp) +{ + int lockval, old, new, owner, count; + + lockval = SPINLOCK_LOCKVAL; /* cpu + 1 */ + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_spin_yield_target(READ_ONCE(lp->lock), NULL); + if (owner && arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + + count = spin_retry; + while (1) { + old = arch_load_niai4(&lp->lock); + owner = old & _Q_LOCK_CPU_MASK; + /* Try to get the lock if it is free. */ + if (!owner) { + new = (old & _Q_TAIL_MASK) | lockval; + if (arch_cmpxchg_niai8(&lp->lock, old, new)) { + /* Got the lock */ + return; + } + continue; + } + if (count-- >= 0) + continue; + count = spin_retry; + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1)) + smp_yield_cpu(owner - 1); + } +} + +void arch_spin_lock_wait(arch_spinlock_t *lp) +{ + if (test_cpu_flag(CIF_DEDICATED_CPU)) + arch_spin_lock_queued(lp); + else + arch_spin_lock_classic(lp); +} +EXPORT_SYMBOL(arch_spin_lock_wait); + +int arch_spin_trylock_retry(arch_spinlock_t *lp) +{ + int cpu = SPINLOCK_LOCKVAL; + int owner, count; + + for (count = spin_retry; count > 0; count--) { + owner = READ_ONCE(lp->lock); + /* Try to get the lock if it is free. */ + if (!owner) { + if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + return 1; + } + } + return 0; +} +EXPORT_SYMBOL(arch_spin_trylock_retry); + +void arch_read_lock_wait(arch_rwlock_t *rw) +{ + if (unlikely(in_interrupt())) { + while (READ_ONCE(rw->cnts) & 0x10000) + barrier(); + return; + } + + /* Remove this reader again to allow recursive read locking */ + __atomic_add_const(-1, &rw->cnts); + /* Put the reader into the wait queue */ + arch_spin_lock(&rw->wait); + /* Now add this reader to the count value again */ + __atomic_add_const(1, &rw->cnts); + /* Loop until the writer is done */ + while (READ_ONCE(rw->cnts) & 0x10000) + barrier(); + arch_spin_unlock(&rw->wait); +} +EXPORT_SYMBOL(arch_read_lock_wait); + +void arch_write_lock_wait(arch_rwlock_t *rw) +{ + int old; + + /* Add this CPU to the write waiters */ + __atomic_add(0x20000, &rw->cnts); + + /* Put the writer into the wait queue */ + arch_spin_lock(&rw->wait); + + while (1) { + old = READ_ONCE(rw->cnts); + if ((old & 0x1ffff) == 0 && + __atomic_cmpxchg_bool(&rw->cnts, old, old | 0x10000)) + /* Got the lock */ + break; + barrier(); + } + + arch_spin_unlock(&rw->wait); +} +EXPORT_SYMBOL(arch_write_lock_wait); + +void arch_spin_relax(arch_spinlock_t *lp) +{ + int cpu; + + cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK; + if (!cpu) + return; + if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1)) + return; + smp_yield_cpu(cpu - 1); +} +EXPORT_SYMBOL(arch_spin_relax); diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c new file mode 100644 index 0000000000..7d87418182 --- /dev/null +++ b/arch/s390/lib/string.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Optimized string functions + * + * S390 version + * Copyright IBM Corp. 2004 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ + +#define IN_ARCH_STRING_C 1 +#ifndef __NO_FORTIFY +# define __NO_FORTIFY +#endif + +#include +#include +#include + +/* + * Helper functions to find the end of a string + */ +static inline char *__strend(const char *s) +{ + unsigned long e = 0; + + asm volatile( + " lghi 0,0\n" + "0: srst %[e],%[s]\n" + " jo 0b\n" + : [e] "+&a" (e), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)e; +} + +static inline char *__strnend(const char *s, size_t n) +{ + const char *p = s + n; + + asm volatile( + " lghi 0,0\n" + "0: srst %[p],%[s]\n" + " jo 0b\n" + : [p] "+&d" (p), [s] "+&a" (s) + : + : "cc", "memory", "0"); + return (char *)p; +} + +/** + * strlen - Find the length of a string + * @s: The string to be sized + * + * returns the length of @s + */ +#ifdef __HAVE_ARCH_STRLEN +size_t strlen(const char *s) +{ + return __strend(s) - s; +} +EXPORT_SYMBOL(strlen); +#endif + +/** + * strnlen - Find the length of a length-limited string + * @s: The string to be sized + * @n: The maximum number of bytes to search + * + * returns the minimum of the length of @s and @n + */ +#ifdef __HAVE_ARCH_STRNLEN +size_t strnlen(const char *s, size_t n) +{ + return __strnend(s, n) - s; +} +EXPORT_SYMBOL(strnlen); +#endif + +/** + * strcpy - Copy a %NUL terminated string + * @dest: Where to copy the string to + * @src: Where to copy the string from + * + * returns a pointer to @dest + */ +#ifdef __HAVE_ARCH_STRCPY +char *strcpy(char *dest, const char *src) +{ + char *ret = dest; + + asm volatile( + " lghi 0,0\n" + "0: mvst %[dest],%[src]\n" + " jo 0b\n" + : [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); + return ret; +} +EXPORT_SYMBOL(strcpy); +#endif + +/** + * strncpy - Copy a length-limited, %NUL-terminated string + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @n: The maximum number of bytes to copy + * + * The result is not %NUL-terminated if the source exceeds + * @n bytes. + */ +#ifdef __HAVE_ARCH_STRNCPY +char *strncpy(char *dest, const char *src, size_t n) +{ + size_t len = __strnend(src, n) - src; + memset(dest + len, 0, n - len); + memcpy(dest, src, len); + return dest; +} +EXPORT_SYMBOL(strncpy); +#endif + +/** + * strcat - Append one %NUL-terminated string to another + * @dest: The string to be appended to + * @src: The string to append to it + * + * returns a pointer to @dest + */ +#ifdef __HAVE_ARCH_STRCAT +char *strcat(char *dest, const char *src) +{ + unsigned long dummy = 0; + char *ret = dest; + + asm volatile( + " lghi 0,0\n" + "0: srst %[dummy],%[dest]\n" + " jo 0b\n" + "1: mvst %[dummy],%[src]\n" + " jo 1b\n" + : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) + : + : "cc", "memory", "0"); + return ret; +} +EXPORT_SYMBOL(strcat); +#endif + +/** + * strlcat - Append a length-limited, %NUL-terminated string to another + * @dest: The string to be appended to + * @src: The string to append to it + * @n: The size of the destination buffer. + */ +#ifdef __HAVE_ARCH_STRLCAT +size_t strlcat(char *dest, const char *src, size_t n) +{ + size_t dsize = __strend(dest) - dest; + size_t len = __strend(src) - src; + size_t res = dsize + len; + + if (dsize < n) { + dest += dsize; + n -= dsize; + if (len >= n) + len = n - 1; + dest[len] = '\0'; + memcpy(dest, src, len); + } + return res; +} +EXPORT_SYMBOL(strlcat); +#endif + +/** + * strncat - Append a length-limited, %NUL-terminated string to another + * @dest: The string to be appended to + * @src: The string to append to it + * @n: The maximum numbers of bytes to copy + * + * returns a pointer to @dest + * + * Note that in contrast to strncpy, strncat ensures the result is + * terminated. + */ +#ifdef __HAVE_ARCH_STRNCAT +char *strncat(char *dest, const char *src, size_t n) +{ + size_t len = __strnend(src, n) - src; + char *p = __strend(dest); + + p[len] = '\0'; + memcpy(p, src, len); + return dest; +} +EXPORT_SYMBOL(strncat); +#endif + +/** + * strcmp - Compare two strings + * @s1: One string + * @s2: Another string + * + * returns 0 if @s1 and @s2 are equal, + * < 0 if @s1 is less than @s2 + * > 0 if @s1 is greater than @s2 + */ +#ifdef __HAVE_ARCH_STRCMP +int strcmp(const char *s1, const char *s2) +{ + int ret = 0; + + asm volatile( + " lghi 0,0\n" + "0: clst %[s1],%[s2]\n" + " jo 0b\n" + " je 1f\n" + " ic %[ret],0(%[s1])\n" + " ic 0,0(%[s2])\n" + " sr %[ret],0\n" + "1:" + : [ret] "+&d" (ret), [s1] "+&a" (s1), [s2] "+&a" (s2) + : + : "cc", "memory", "0"); + return ret; +} +EXPORT_SYMBOL(strcmp); +#endif + +static inline int clcle(const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + union register_pair r1 = { .even = (unsigned long)s1, .odd = l1, }; + union register_pair r3 = { .even = (unsigned long)s2, .odd = l2, }; + int cc; + + asm volatile( + "0: clcle %[r1],%[r3],0\n" + " jo 0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair) + : + : "cc", "memory"); + return cc; +} + +/** + * strstr - Find the first substring in a %NUL terminated string + * @s1: The string to be searched + * @s2: The string to search for + */ +#ifdef __HAVE_ARCH_STRSTR +char *strstr(const char *s1, const char *s2) +{ + int l1, l2; + + l2 = __strend(s2) - s2; + if (!l2) + return (char *) s1; + l1 = __strend(s1) - s1; + while (l1-- >= l2) { + int cc; + + cc = clcle(s1, l2, s2, l2); + if (!cc) + return (char *) s1; + s1++; + } + return NULL; +} +EXPORT_SYMBOL(strstr); +#endif + +/** + * memchr - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first occurrence of @c, or %NULL + * if @c is not found + */ +#ifdef __HAVE_ARCH_MEMCHR +void *memchr(const void *s, int c, size_t n) +{ + const void *ret = s + n; + + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + " jl 1f\n" + " la %[ret],0\n" + "1:" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *) ret; +} +EXPORT_SYMBOL(memchr); +#endif + +/** + * memcmp - Compare two areas of memory + * @s1: One area of memory + * @s2: Another area of memory + * @n: The size of the area. + */ +#ifdef __HAVE_ARCH_MEMCMP +int memcmp(const void *s1, const void *s2, size_t n) +{ + int ret; + + ret = clcle(s1, n, s2, n); + if (ret) + ret = ret == 1 ? -1 : 1; + return ret; +} +EXPORT_SYMBOL(memcmp); +#endif + +/** + * memscan - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first occurrence of @c, or 1 byte past + * the area if @c is not found + */ +#ifdef __HAVE_ARCH_MEMSCAN +void *memscan(void *s, int c, size_t n) +{ + const void *ret = s + n; + + asm volatile( + " lgr 0,%[c]\n" + "0: srst %[ret],%[s]\n" + " jo 0b\n" + : [ret] "+&a" (ret), [s] "+&a" (s) + : [c] "d" (c) + : "cc", "memory", "0"); + return (void *)ret; +} +EXPORT_SYMBOL(memscan); +#endif diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c new file mode 100644 index 0000000000..9e62d62812 --- /dev/null +++ b/arch/s390/lib/test_kprobes.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include "test_kprobes.h" + +static struct kprobe kp; + +static void setup_kprobe(struct kunit *test, struct kprobe *kp, + const char *symbol, int offset) +{ + kp->offset = offset; + kp->addr = NULL; + kp->symbol_name = symbol; +} + +static void test_kprobe_offset(struct kunit *test, struct kprobe *kp, + const char *target, int offset) +{ + int ret; + + setup_kprobe(test, kp, target, 0); + ret = register_kprobe(kp); + if (!ret) + unregister_kprobe(kp); + KUNIT_EXPECT_EQ(test, 0, ret); + setup_kprobe(test, kp, target, offset); + ret = register_kprobe(kp); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + if (!ret) + unregister_kprobe(kp); +} + +static void test_kprobe_odd(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_odd", + kprobes_target_odd_offs); +} + +static void test_kprobe_in_insn4(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn4", + kprobes_target_in_insn4_offs); +} + +static void test_kprobe_in_insn6_lo(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo", + kprobes_target_in_insn6_lo_offs); +} + +static void test_kprobe_in_insn6_hi(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi", + kprobes_target_in_insn6_hi_offs); +} + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe_odd), + KUNIT_CASE(test_kprobe_in_insn4), + KUNIT_CASE(test_kprobe_in_insn6_lo), + KUNIT_CASE(test_kprobe_in_insn6_hi), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_s390", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h new file mode 100644 index 0000000000..2b4c9bc337 --- /dev/null +++ b/arch/s390/lib/test_kprobes.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +extern unsigned long kprobes_target_odd_offs; +extern unsigned long kprobes_target_in_insn4_offs; +extern unsigned long kprobes_target_in_insn6_lo_offs; +extern unsigned long kprobes_target_in_insn6_hi_offs; + +#endif diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S new file mode 100644 index 0000000000..ade7a30423 --- /dev/null +++ b/arch/s390/lib/test_kprobes_asm.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include +#include + +#define KPROBES_TARGET_START(name) \ + SYM_FUNC_START(name); \ + FTRACE_GEN_NOP_ASM(name) + +#define KPROBES_TARGET_END(name) \ + SYM_FUNC_END(name); \ + SYM_DATA(name##_offs, .quad 1b - name) + +KPROBES_TARGET_START(kprobes_target_in_insn4) + .word 0x4700 // bc 0,0 +1: .word 0x0000 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn4) + +KPROBES_TARGET_START(kprobes_target_in_insn6_lo) + .word 0xe310 // ly 1,0 +1: .word 0x0000 + .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_lo) + +KPROBES_TARGET_START(kprobes_target_in_insn6_hi) + .word 0xe310 // ly 1,0 + .word 0x0000 +1: .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_hi) + +KPROBES_TARGET_START(kprobes_target_bp) + nop + .word 0x0000 + nop +1: br %r14 +KPROBES_TARGET_END(kprobes_target_bp) + +KPROBES_TARGET_START(kprobes_target_odd) + .byte 0x07 +1: .byte 0x07 + br %r14 +KPROBES_TARGET_END(kprobes_target_odd) diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c new file mode 100644 index 0000000000..9894009fc1 --- /dev/null +++ b/arch/s390/lib/test_modules.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include + +#include "test_modules.h" + +/* + * Test that modules with many relocations are loaded properly. + */ +static void test_modules_many_vmlinux_relocs(struct kunit *test) +{ + int result = 0; + +#define CALL_RETURN(i) result += test_modules_return_ ## i() + REPEAT_10000(CALL_RETURN); + KUNIT_ASSERT_EQ(test, result, 49995000); +} + +static struct kunit_case modules_testcases[] = { + KUNIT_CASE(test_modules_many_vmlinux_relocs), + {} +}; + +static struct kunit_suite modules_test_suite = { + .name = "modules_test_s390", + .test_cases = modules_testcases, +}; + +kunit_test_suites(&modules_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h new file mode 100644 index 0000000000..6371fcf176 --- /dev/null +++ b/arch/s390/lib/test_modules.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_MODULES_H +#define TEST_MODULES_H + +#define __REPEAT_10000_3(f, x) \ + f(x ## 0); \ + f(x ## 1); \ + f(x ## 2); \ + f(x ## 3); \ + f(x ## 4); \ + f(x ## 5); \ + f(x ## 6); \ + f(x ## 7); \ + f(x ## 8); \ + f(x ## 9) +#define __REPEAT_10000_2(f, x) \ + __REPEAT_10000_3(f, x ## 0); \ + __REPEAT_10000_3(f, x ## 1); \ + __REPEAT_10000_3(f, x ## 2); \ + __REPEAT_10000_3(f, x ## 3); \ + __REPEAT_10000_3(f, x ## 4); \ + __REPEAT_10000_3(f, x ## 5); \ + __REPEAT_10000_3(f, x ## 6); \ + __REPEAT_10000_3(f, x ## 7); \ + __REPEAT_10000_3(f, x ## 8); \ + __REPEAT_10000_3(f, x ## 9) +#define __REPEAT_10000_1(f, x) \ + __REPEAT_10000_2(f, x ## 0); \ + __REPEAT_10000_2(f, x ## 1); \ + __REPEAT_10000_2(f, x ## 2); \ + __REPEAT_10000_2(f, x ## 3); \ + __REPEAT_10000_2(f, x ## 4); \ + __REPEAT_10000_2(f, x ## 5); \ + __REPEAT_10000_2(f, x ## 6); \ + __REPEAT_10000_2(f, x ## 7); \ + __REPEAT_10000_2(f, x ## 8); \ + __REPEAT_10000_2(f, x ## 9) +#define REPEAT_10000(f) \ + __REPEAT_10000_1(f, 0); \ + __REPEAT_10000_1(f, 1); \ + __REPEAT_10000_1(f, 2); \ + __REPEAT_10000_1(f, 3); \ + __REPEAT_10000_1(f, 4); \ + __REPEAT_10000_1(f, 5); \ + __REPEAT_10000_1(f, 6); \ + __REPEAT_10000_1(f, 7); \ + __REPEAT_10000_1(f, 8); \ + __REPEAT_10000_1(f, 9) + +#define DECLARE_RETURN(i) int test_modules_return_ ## i(void) +REPEAT_10000(DECLARE_RETURN); + +#endif diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c new file mode 100644 index 0000000000..1670349a03 --- /dev/null +++ b/arch/s390/lib/test_modules_helpers.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include + +#include "test_modules.h" + +#define DEFINE_RETURN(i) \ + int test_modules_return_ ## i(void) \ + { \ + return 1 ## i - 10000; \ + } \ + EXPORT_SYMBOL_GPL(test_modules_return_ ## i) +REPEAT_10000(DEFINE_RETURN); diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c new file mode 100644 index 0000000000..7231bf97b9 --- /dev/null +++ b/arch/s390/lib/test_unwind.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test module for unwind_for_each_frame + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kunit *current_test; + +#define BT_BUF_SIZE (PAGE_SIZE * 4) + +static bool force_bt; +module_param_named(backtrace, force_bt, bool, 0444); +MODULE_PARM_DESC(backtrace, "print backtraces for all tests"); + +/* + * To avoid printk line limit split backtrace by lines + */ +static void print_backtrace(char *bt) +{ + char *p; + + while (true) { + p = strsep(&bt, "\n"); + if (!p) + break; + kunit_err(current_test, "%s\n", p); + } +} + +/* + * Calls unwind_for_each_frame(task, regs, sp) and verifies that the result + * contains unwindme_func2 followed by unwindme_func1. + */ +static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, + unsigned long sp) +{ + int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline; + const int max_frames = 128; + struct unwind_state state; + size_t bt_pos = 0; + int ret = 0; + char *bt; + + bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC); + if (!bt) { + kunit_err(current_test, "failed to allocate backtrace buffer\n"); + return -ENOMEM; + } + /* Unwind. */ + frame_count = 0; + prev_is_func2 = 0; + seen_func2_func1 = 0; + seen_arch_rethook_trampoline = 0; + unwind_for_each_frame(&state, task, regs, sp) { + unsigned long addr = unwind_get_return_address(&state); + char sym[KSYM_SYMBOL_LEN]; + + if (frame_count++ == max_frames) + break; + if (state.reliable && !addr) { + kunit_err(current_test, "unwind state reliable but addr is 0\n"); + ret = -EINVAL; + break; + } + sprint_symbol(sym, addr); + if (bt_pos < BT_BUF_SIZE) { + bt_pos += snprintf(bt + bt_pos, BT_BUF_SIZE - bt_pos, + state.reliable ? " [%-7s%px] %pSR\n" : + "([%-7s%px] %pSR)\n", + stack_type_name(state.stack_info.type), + (void *)state.sp, (void *)state.ip); + if (bt_pos >= BT_BUF_SIZE) + kunit_err(current_test, "backtrace buffer is too small\n"); + } + frame_count += 1; + if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) + seen_func2_func1 = 1; + prev_is_func2 = str_has_prefix(sym, "unwindme_func2"); + if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/")) + seen_arch_rethook_trampoline = 1; + } + + /* Check the results. */ + if (unwind_error(&state)) { + kunit_err(current_test, "unwind error\n"); + ret = -EINVAL; + } + if (!seen_func2_func1) { + kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n"); + ret = -EINVAL; + } + if (frame_count == max_frames) { + kunit_err(current_test, "Maximum number of frames exceeded\n"); + ret = -EINVAL; + } + if (seen_arch_rethook_trampoline) { + kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n"); + ret = -EINVAL; + } + if (ret || force_bt) + print_backtrace(bt); + kfree(bt); + return ret; +} + +/* State of the task being unwound. */ +struct unwindme { + int flags; + int ret; + struct task_struct *task; + struct completion task_ready; + wait_queue_head_t task_wq; + unsigned long sp; +}; + +static struct unwindme *unwindme; + +/* Values of unwindme.flags. */ +#define UWM_DEFAULT 0x0 +#define UWM_THREAD 0x1 /* Unwind a separate task. */ +#define UWM_REGS 0x2 /* Pass regs to test_unwind(). */ +#define UWM_SP 0x4 /* Pass sp to test_unwind(). */ +#define UWM_CALLER 0x8 /* Unwind starting from caller. */ +#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */ +#define UWM_IRQ 0x20 /* Unwind from irq context. */ +#define UWM_PGM 0x40 /* Unwind from program check handler */ +#define UWM_KPROBE_ON_FTRACE 0x80 /* Unwind from kprobe handler called via ftrace. */ +#define UWM_FTRACE 0x100 /* Unwind from ftrace handler. */ +#define UWM_KRETPROBE 0x200 /* Unwind through kretprobed function. */ +#define UWM_KRETPROBE_HANDLER 0x400 /* Unwind from kretprobe handler. */ + +static __always_inline struct pt_regs fake_pt_regs(void) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + regs.gprs[15] = current_stack_pointer; + + asm volatile( + "basr %[psw_addr],0\n" + : [psw_addr] "=d" (regs.psw.addr)); + return regs; +} + +static int kretprobe_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct unwindme *u = unwindme; + + if (!(u->flags & UWM_KRETPROBE_HANDLER)) + return 0; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, + (u->flags & UWM_SP) ? u->sp : 0); + + return 0; +} + +static noinline notrace int test_unwind_kretprobed_func(struct unwindme *u) +{ + struct pt_regs regs; + + if (!(u->flags & UWM_KRETPROBE)) + return 0; + + regs = fake_pt_regs(); + return test_unwind(NULL, (u->flags & UWM_REGS) ? ®s : NULL, + (u->flags & UWM_SP) ? u->sp : 0); +} + +static noinline int test_unwind_kretprobed_func_caller(struct unwindme *u) +{ + return test_unwind_kretprobed_func(u); +} + +static int test_unwind_kretprobe(struct unwindme *u) +{ + int ret; + struct kretprobe my_kretprobe; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + + memset(&my_kretprobe, 0, sizeof(my_kretprobe)); + my_kretprobe.handler = kretprobe_ret_handler; + my_kretprobe.maxactive = 1; + my_kretprobe.kp.addr = (kprobe_opcode_t *)test_unwind_kretprobed_func; + + ret = register_kretprobe(&my_kretprobe); + + if (ret < 0) { + kunit_err(current_test, "register_kretprobe failed %d\n", ret); + return -EINVAL; + } + + ret = test_unwind_kretprobed_func_caller(u); + unregister_kretprobe(&my_kretprobe); + unwindme = NULL; + if (u->flags & UWM_KRETPROBE_HANDLER) + ret = u->ret; + return ret; +} + +static int kprobe_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct unwindme *u = unwindme; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL, + (u->flags & UWM_SP) ? u->sp : 0); + return 0; +} + +extern const char test_unwind_kprobed_insn[]; + +static noinline void test_unwind_kprobed_func(void) +{ + asm volatile( + " nopr %%r7\n" + "test_unwind_kprobed_insn:\n" + " nopr %%r7\n" + :); +} + +static int test_unwind_kprobe(struct unwindme *u) +{ + struct kprobe kp; + int ret; + + if (!IS_ENABLED(CONFIG_KPROBES)) + kunit_skip(current_test, "requires CONFIG_KPROBES"); + if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE) && u->flags & UWM_KPROBE_ON_FTRACE) + kunit_skip(current_test, "requires CONFIG_KPROBES_ON_FTRACE"); + + u->ret = -1; /* make sure kprobe is called */ + unwindme = u; + memset(&kp, 0, sizeof(kp)); + kp.pre_handler = kprobe_pre_handler; + kp.addr = u->flags & UWM_KPROBE_ON_FTRACE ? + (kprobe_opcode_t *)test_unwind_kprobed_func : + (kprobe_opcode_t *)test_unwind_kprobed_insn; + ret = register_kprobe(&kp); + if (ret < 0) { + kunit_err(current_test, "register_kprobe failed %d\n", ret); + return -EINVAL; + } + + test_unwind_kprobed_func(); + unregister_kprobe(&kp); + unwindme = NULL; + return u->ret; +} + +static void notrace __used test_unwind_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct ftrace_regs *fregs) +{ + struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2]; + + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL, + (u->flags & UWM_SP) ? u->sp : 0); +} + +static noinline int test_unwind_ftraced_func(struct unwindme *u) +{ + return READ_ONCE(u)->ret; +} + +static int test_unwind_ftrace(struct unwindme *u) +{ + int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + struct ftrace_ops *fops; + + fops = kunit_kzalloc(current_test, sizeof(*fops), GFP_KERNEL); + fops->func = test_unwind_ftrace_handler; + fops->flags = FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_PERMANENT; +#else + kunit_skip(current_test, "requires CONFIG_DYNAMIC_FTRACE"); +#endif + + ret = ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 0, 0); + if (ret) { + kunit_err(current_test, "failed to set ftrace filter (%d)\n", ret); + return -1; + } + + ret = register_ftrace_function(fops); + if (!ret) { + ret = test_unwind_ftraced_func(u); + unregister_ftrace_function(fops); + } else { + kunit_err(current_test, "failed to register ftrace handler (%d)\n", ret); + } + + ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 1, 0); + return ret; +} + +/* This function may or may not appear in the backtrace. */ +static noinline int unwindme_func4(struct unwindme *u) +{ + if (!(u->flags & UWM_CALLER)) + u->sp = current_frame_address(); + if (u->flags & UWM_THREAD) { + complete(&u->task_ready); + wait_event(u->task_wq, kthread_should_park()); + kthread_parkme(); + return 0; + } else if (u->flags & (UWM_PGM | UWM_KPROBE_ON_FTRACE)) { + return test_unwind_kprobe(u); + } else if (u->flags & (UWM_KRETPROBE | UWM_KRETPROBE_HANDLER)) { + return test_unwind_kretprobe(u); + } else if (u->flags & UWM_FTRACE) { + return test_unwind_ftrace(u); + } else { + struct pt_regs regs = fake_pt_regs(); + + return test_unwind(NULL, + (u->flags & UWM_REGS) ? ®s : NULL, + (u->flags & UWM_SP) ? u->sp : 0); + } +} + +/* This function may or may not appear in the backtrace. */ +static noinline int unwindme_func3(struct unwindme *u) +{ + u->sp = current_frame_address(); + return unwindme_func4(u); +} + +/* This function must appear in the backtrace. */ +static noinline int unwindme_func2(struct unwindme *u) +{ + unsigned long flags; + int rc; + + if (u->flags & UWM_SWITCH_STACK) { + local_irq_save(flags); + local_mcck_disable(); + rc = call_on_stack(1, S390_lowcore.nodat_stack, + int, unwindme_func3, struct unwindme *, u); + local_mcck_enable(); + local_irq_restore(flags); + return rc; + } else { + return unwindme_func3(u); + } +} + +/* This function must follow unwindme_func2 in the backtrace. */ +static noinline int unwindme_func1(void *u) +{ + return unwindme_func2((struct unwindme *)u); +} + +static void unwindme_timer_fn(struct timer_list *unused) +{ + struct unwindme *u = READ_ONCE(unwindme); + + if (u) { + unwindme = NULL; + u->task = NULL; + u->ret = unwindme_func1(u); + complete(&u->task_ready); + } +} + +static struct timer_list unwind_timer; + +static int test_unwind_irq(struct unwindme *u) +{ + unwindme = u; + init_completion(&u->task_ready); + timer_setup(&unwind_timer, unwindme_timer_fn, 0); + mod_timer(&unwind_timer, jiffies + 1); + wait_for_completion(&u->task_ready); + return u->ret; +} + +/* Spawns a task and passes it to test_unwind(). */ +static int test_unwind_task(struct unwindme *u) +{ + struct task_struct *task; + int ret; + + /* Initialize thread-related fields. */ + init_completion(&u->task_ready); + init_waitqueue_head(&u->task_wq); + + /* + * Start the task and wait until it reaches unwindme_func4() and sleeps + * in (task_ready, unwind_done] range. + */ + task = kthread_run(unwindme_func1, u, "%s", __func__); + if (IS_ERR(task)) { + kunit_err(current_test, "kthread_run() failed\n"); + return PTR_ERR(task); + } + /* + * Make sure task reaches unwindme_func4 before parking it, + * we might park it before kthread function has been executed otherwise + */ + wait_for_completion(&u->task_ready); + kthread_park(task); + /* Unwind. */ + ret = test_unwind(task, NULL, (u->flags & UWM_SP) ? u->sp : 0); + kthread_stop(task); + return ret; +} + +struct test_params { + int flags; + char *name; +}; + +/* + * Create required parameter list for tests + */ +#define TEST_WITH_FLAGS(f) { .flags = f, .name = #f } +static const struct test_params param_list[] = { + TEST_WITH_FLAGS(UWM_DEFAULT), + TEST_WITH_FLAGS(UWM_SP), + TEST_WITH_FLAGS(UWM_REGS), + TEST_WITH_FLAGS(UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_THREAD), + TEST_WITH_FLAGS(UWM_THREAD | UWM_SP), + TEST_WITH_FLAGS(UWM_THREAD | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK), + TEST_WITH_FLAGS(UWM_PGM), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP), + TEST_WITH_FLAGS(UWM_PGM | UWM_REGS), + TEST_WITH_FLAGS(UWM_PGM | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_REGS), + TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_REGS), + TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP | UWM_REGS), +}; + +/* + * Parameter description generator: required for KUNIT_ARRAY_PARAM() + */ +static void get_desc(const struct test_params *params, char *desc) +{ + strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE); +} + +/* + * Create test_unwind_gen_params + */ +KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc); + +static void test_unwind_flags(struct kunit *test) +{ + struct unwindme u; + const struct test_params *params; + + current_test = test; + params = (const struct test_params *)test->param_value; + u.flags = params->flags; + if (u.flags & UWM_THREAD) + KUNIT_EXPECT_EQ(test, 0, test_unwind_task(&u)); + else if (u.flags & UWM_IRQ) + KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u)); + else + KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u)); +} + +static struct kunit_case unwind_test_cases[] = { + KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params), + {} +}; + +static struct kunit_suite test_unwind_suite = { + .name = "test_unwind", + .test_cases = unwind_test_cases, +}; + +kunit_test_suites(&test_unwind_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S new file mode 100644 index 0000000000..96214f51f4 --- /dev/null +++ b/arch/s390/lib/tishift.S @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + + .section .noinstr.text, "ax" + + GEN_BR_THUNK %r14 + +SYM_FUNC_START(__ashlti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + srlg %r3,%r1,0(%r3) + sllg %r0,%r0,0(%r4) + sllg %r1,%r1,0(%r4) + ogr %r0,%r3 + j 1f +0: sllg %r0,%r1,-64(%r4) + lghi %r1,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashlti3) +EXPORT_SYMBOL(__ashlti3) + +SYM_FUNC_START(__ashrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srag %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srag %r1,%r0,-64(%r4) + srag %r0,%r0,63 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__ashrti3) +EXPORT_SYMBOL(__ashrti3) + +SYM_FUNC_START(__lshrti3) + lmg %r0,%r1,0(%r3) + cije %r4,0,1f + lhi %r3,64 + sr %r3,%r4 + jnh 0f + sllg %r3,%r0,0(%r3) + srlg %r1,%r1,0(%r4) + srlg %r0,%r0,0(%r4) + ogr %r1,%r3 + j 1f +0: srlg %r1,%r0,-64(%r4) + lghi %r0,0 +1: stmg %r0,%r1,0(%r2) + BR_EX %r14 +SYM_FUNC_END(__lshrti3) +EXPORT_SYMBOL(__lshrti3) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c new file mode 100644 index 0000000000..e4a13d7cab --- /dev/null +++ b/arch/s390/lib/uaccess.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Standard user space access functions based on mvcp/mvcs and doing + * interesting things in the secondary space mode. + * + * Copyright IBM Corp. 2006,2014 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Gerald Schaefer (gerald.schaefer@de.ibm.com) + */ + +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_ENTRY +void debug_user_asce(int exit) +{ + unsigned long cr1, cr7; + + __ctl_store(cr1, 1, 1); + __ctl_store(cr7, 7, 7); + if (cr1 == S390_lowcore.kernel_asce && cr7 == S390_lowcore.user_asce) + return; + panic("incorrect ASCE on kernel %s\n" + "cr1: %016lx cr7: %016lx\n" + "kernel: %016llx user: %016llx\n", + exit ? "exit" : "entry", cr1, cr7, + S390_lowcore.kernel_asce, S390_lowcore.user_asce); +} +#endif /*CONFIG_DEBUG_ENTRY */ + +static unsigned long raw_copy_from_user_key(void *to, const void __user *from, + unsigned long size, unsigned long key) +{ + unsigned long rem; + union oac spec = { + .oac2.key = key, + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.k = 1, + .oac2.a = 1, + }; + + asm volatile( + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[from],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */ + " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */ + " slgr %[rem],%[from]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); + return size; +} + +unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return raw_copy_from_user_key(to, from, n, 0); +} +EXPORT_SYMBOL(raw_copy_from_user); + +unsigned long _copy_from_user_key(void *to, const void __user *from, + unsigned long n, unsigned long key) +{ + unsigned long res = n; + + might_fault(); + if (!should_fail_usercopy()) { + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); + } + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +EXPORT_SYMBOL(_copy_from_user_key); + +static unsigned long raw_copy_to_user_key(void __user *to, const void *from, + unsigned long size, unsigned long key) +{ + unsigned long rem; + union oac spec = { + .oac1.key = key, + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.k = 1, + .oac1.a = 1, + }; + + asm volatile( + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[from]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " slgr %[from],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[from]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [spec] "d" (spec.val) + : "cc", "memory", "0"); + return size; +} + +unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return raw_copy_to_user_key(to, from, n, 0); +} +EXPORT_SYMBOL(raw_copy_to_user); + +unsigned long _copy_to_user_key(void __user *to, const void *from, + unsigned long n, unsigned long key) +{ + might_fault(); + if (should_fail_usercopy()) + return n; + instrument_copy_to_user(to, from, n); + return raw_copy_to_user_key(to, from, n, key); +} +EXPORT_SYMBOL(_copy_to_user_key); + +unsigned long __clear_user(void __user *to, unsigned long size) +{ + unsigned long rem; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; + + asm volatile( + " lr 0,%[spec]\n" + "0: mvcos 0(%[to]),0(%[zeropg]),%[size]\n" + "1: jz 5f\n" + " algr %[size],%[val]\n" + " slgr %[to],%[val]\n" + " j 0b\n" + "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */ + " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */ + " slgr %[rem],%[to]\n" + " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */ + " jnh 6f\n" + "3: mvcos 0(%[to]),0(%[zeropg]),%[rem]\n" + "4: slgr %[size],%[rem]\n" + " j 6f\n" + "5: slgr %[size],%[size]\n" + "6:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + EX_TABLE(3b, 6b) + EX_TABLE(4b, 6b) + : [size] "+&a" (size), [to] "+&a" (to), [rem] "=&a" (rem) + : [val] "a" (-4096UL), [zeropg] "a" (empty_zero_page), [spec] "d" (spec.val) + : "cc", "memory", "0"); + return size; +} +EXPORT_SYMBOL(__clear_user); diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c new file mode 100644 index 0000000000..fb924a8041 --- /dev/null +++ b/arch/s390/lib/xor.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Optimized xor_block operation for RAID4/5 + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include + +static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + asm volatile( + " larl 1,2f\n" + " aghi %0,-1\n" + " jm 3f\n" + " srlg 0,%0,8\n" + " ltgr 0,0\n" + " jz 1f\n" + "0: xc 0(256,%1),0(%2)\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + " brctg 0,0b\n" + "1: ex %0,0(1)\n" + " j 3f\n" + "2: xc 0(1,%1),0(%2)\n" + "3:\n" + : : "d" (bytes), "a" (p1), "a" (p2) + : "0", "1", "cc", "memory"); +} + +static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + asm volatile( + " larl 1,2f\n" + " aghi %0,-1\n" + " jm 3f\n" + " srlg 0,%0,8\n" + " ltgr 0,0\n" + " jz 1f\n" + "0: xc 0(256,%1),0(%2)\n" + " xc 0(256,%1),0(%3)\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + " la %3,256(%3)\n" + " brctg 0,0b\n" + "1: ex %0,0(1)\n" + " ex %0,6(1)\n" + " j 3f\n" + "2: xc 0(1,%1),0(%2)\n" + " xc 0(1,%1),0(%3)\n" + "3:\n" + : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3) + : : "0", "1", "cc", "memory"); +} + +static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + asm volatile( + " larl 1,2f\n" + " aghi %0,-1\n" + " jm 3f\n" + " srlg 0,%0,8\n" + " ltgr 0,0\n" + " jz 1f\n" + "0: xc 0(256,%1),0(%2)\n" + " xc 0(256,%1),0(%3)\n" + " xc 0(256,%1),0(%4)\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + " la %3,256(%3)\n" + " la %4,256(%4)\n" + " brctg 0,0b\n" + "1: ex %0,0(1)\n" + " ex %0,6(1)\n" + " ex %0,12(1)\n" + " j 3f\n" + "2: xc 0(1,%1),0(%2)\n" + " xc 0(1,%1),0(%3)\n" + " xc 0(1,%1),0(%4)\n" + "3:\n" + : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4) + : : "0", "1", "cc", "memory"); +} + +static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + asm volatile( + " larl 1,2f\n" + " aghi %0,-1\n" + " jm 3f\n" + " srlg 0,%0,8\n" + " ltgr 0,0\n" + " jz 1f\n" + "0: xc 0(256,%1),0(%2)\n" + " xc 0(256,%1),0(%3)\n" + " xc 0(256,%1),0(%4)\n" + " xc 0(256,%1),0(%5)\n" + " la %1,256(%1)\n" + " la %2,256(%2)\n" + " la %3,256(%3)\n" + " la %4,256(%4)\n" + " la %5,256(%5)\n" + " brctg 0,0b\n" + "1: ex %0,0(1)\n" + " ex %0,6(1)\n" + " ex %0,12(1)\n" + " ex %0,18(1)\n" + " j 3f\n" + "2: xc 0(1,%1),0(%2)\n" + " xc 0(1,%1),0(%3)\n" + " xc 0(1,%1),0(%4)\n" + " xc 0(1,%1),0(%5)\n" + "3:\n" + : "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4), + "+a" (p5) + : : "0", "1", "cc", "memory"); +} + +struct xor_block_template xor_block_xc = { + .name = "xc", + .do_2 = xor_xc_2, + .do_3 = xor_xc_3, + .do_4 = xor_xc_4, + .do_5 = xor_xc_5, +}; +EXPORT_SYMBOL(xor_block_xc); diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile new file mode 100644 index 0000000000..352ff520fd --- /dev/null +++ b/arch/s390/mm/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux s390-specific parts of the memory manager. +# + +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o + +obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c new file mode 100644 index 0000000000..f475153132 --- /dev/null +++ b/arch/s390/mm/cmm.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Collaborative memory management interface. + * + * Copyright IBM Corp 2003, 2010 + * Author(s): Martin Schwidefsky , + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_CMM_IUCV +static char *cmm_default_sender = "VMRMSVM"; +#endif +static char *sender; +module_param(sender, charp, 0400); +MODULE_PARM_DESC(sender, + "Guest name that may send SMSG messages (default VMRMSVM)"); + +#include "../../../drivers/s390/net/smsgiucv.h" + +#define CMM_NR_PAGES ((PAGE_SIZE / sizeof(unsigned long)) - 2) + +struct cmm_page_array { + struct cmm_page_array *next; + unsigned long index; + unsigned long pages[CMM_NR_PAGES]; +}; + +static long cmm_pages; +static long cmm_timed_pages; +static volatile long cmm_pages_target; +static volatile long cmm_timed_pages_target; +static long cmm_timeout_pages; +static long cmm_timeout_seconds; + +static struct cmm_page_array *cmm_page_list; +static struct cmm_page_array *cmm_timed_page_list; +static DEFINE_SPINLOCK(cmm_lock); + +static struct task_struct *cmm_thread_ptr; +static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); + +static void cmm_timer_fn(struct timer_list *); +static void cmm_set_timer(void); +static DEFINE_TIMER(cmm_timer, cmm_timer_fn); + +static long cmm_alloc_pages(long nr, long *counter, + struct cmm_page_array **list) +{ + struct cmm_page_array *pa, *npa; + unsigned long addr; + + while (nr) { + addr = __get_free_page(GFP_NOIO); + if (!addr) + break; + spin_lock(&cmm_lock); + pa = *list; + if (!pa || pa->index >= CMM_NR_PAGES) { + /* Need a new page for the page list. */ + spin_unlock(&cmm_lock); + npa = (struct cmm_page_array *) + __get_free_page(GFP_NOIO); + if (!npa) { + free_page(addr); + break; + } + spin_lock(&cmm_lock); + pa = *list; + if (!pa || pa->index >= CMM_NR_PAGES) { + npa->next = pa; + npa->index = 0; + pa = npa; + *list = pa; + } else + free_page((unsigned long) npa); + } + diag10_range(virt_to_pfn((void *)addr), 1); + pa->pages[pa->index++] = addr; + (*counter)++; + spin_unlock(&cmm_lock); + nr--; + } + return nr; +} + +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + struct cmm_page_array *pa; + unsigned long addr; + + spin_lock(&cmm_lock); + pa = *list; + while (nr) { + if (!pa || pa->index <= 0) + break; + addr = pa->pages[--pa->index]; + if (pa->index == 0) { + pa = pa->next; + free_page((unsigned long) *list); + *list = pa; + } + free_page(addr); + (*counter)--; + nr--; + } + spin_unlock(&cmm_lock); + return nr; +} + +static int cmm_oom_notify(struct notifier_block *self, + unsigned long dummy, void *parm) +{ + unsigned long *freed = parm; + long nr = 256; + + nr = cmm_free_pages(nr, &cmm_timed_pages, &cmm_timed_page_list); + if (nr > 0) + nr = cmm_free_pages(nr, &cmm_pages, &cmm_page_list); + cmm_pages_target = cmm_pages; + cmm_timed_pages_target = cmm_timed_pages; + *freed += 256 - nr; + return NOTIFY_OK; +} + +static struct notifier_block cmm_oom_nb = { + .notifier_call = cmm_oom_notify, +}; + +static int cmm_thread(void *dummy) +{ + int rc; + + while (1) { + rc = wait_event_interruptible(cmm_thread_wait, + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); + if (kthread_should_stop() || rc == -ERESTARTSYS) { + cmm_pages_target = cmm_pages; + cmm_timed_pages_target = cmm_timed_pages; + break; + } + if (cmm_pages_target > cmm_pages) { + if (cmm_alloc_pages(1, &cmm_pages, &cmm_page_list)) + cmm_pages_target = cmm_pages; + } else if (cmm_pages_target < cmm_pages) { + cmm_free_pages(1, &cmm_pages, &cmm_page_list); + } + if (cmm_timed_pages_target > cmm_timed_pages) { + if (cmm_alloc_pages(1, &cmm_timed_pages, + &cmm_timed_page_list)) + cmm_timed_pages_target = cmm_timed_pages; + } else if (cmm_timed_pages_target < cmm_timed_pages) { + cmm_free_pages(1, &cmm_timed_pages, + &cmm_timed_page_list); + } + if (cmm_timed_pages > 0 && !timer_pending(&cmm_timer)) + cmm_set_timer(); + } + return 0; +} + +static void cmm_kick_thread(void) +{ + wake_up(&cmm_thread_wait); +} + +static void cmm_set_timer(void) +{ + if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { + if (timer_pending(&cmm_timer)) + del_timer(&cmm_timer); + return; + } + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); +} + +static void cmm_timer_fn(struct timer_list *unused) +{ + long nr; + + nr = cmm_timed_pages_target - cmm_timeout_pages; + if (nr < 0) + cmm_timed_pages_target = 0; + else + cmm_timed_pages_target = nr; + cmm_kick_thread(); + cmm_set_timer(); +} + +static void cmm_set_pages(long nr) +{ + cmm_pages_target = nr; + cmm_kick_thread(); +} + +static long cmm_get_pages(void) +{ + return cmm_pages; +} + +static void cmm_add_timed_pages(long nr) +{ + cmm_timed_pages_target += nr; + cmm_kick_thread(); +} + +static long cmm_get_timed_pages(void) +{ + return cmm_timed_pages; +} + +static void cmm_set_timeout(long nr, long seconds) +{ + cmm_timeout_pages = nr; + cmm_timeout_seconds = seconds; + cmm_set_timer(); +} + +static int cmm_skip_blanks(char *cp, char **endp) +{ + char *str; + + for (str = cp; *str == ' ' || *str == '\t'; str++) + ; + *endp = str; + return str != cp; +} + +static int cmm_pages_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_add_timed_pages(nr); + return 0; +} + +static int cmm_timeout_handler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[64], *p; + long nr, seconds; + unsigned int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = min(*lenp, sizeof(buf)); + memcpy(buf, buffer, len); + buf[len - 1] = '\0'; + cmm_skip_blanks(buf, &p); + nr = simple_strtoul(p, &p, 0); + cmm_skip_blanks(p, &p); + seconds = simple_strtoul(p, &p, 0); + cmm_set_timeout(nr, seconds); + *ppos += *lenp; + } else { + len = sprintf(buf, "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); + if (len > *lenp) + len = *lenp; + memcpy(buffer, buf, len); + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table cmm_table[] = { + { + .procname = "cmm_pages", + .mode = 0644, + .proc_handler = cmm_pages_handler, + }, + { + .procname = "cmm_timed_pages", + .mode = 0644, + .proc_handler = cmm_timed_pages_handler, + }, + { + .procname = "cmm_timeout", + .mode = 0644, + .proc_handler = cmm_timeout_handler, + }, + { } +}; + +#ifdef CONFIG_CMM_IUCV +#define SMSG_PREFIX "CMM" +static void cmm_smsg_target(const char *from, char *msg) +{ + long nr, seconds; + + if (strlen(sender) > 0 && strcmp(from, sender) != 0) + return; + if (!cmm_skip_blanks(msg + strlen(SMSG_PREFIX), &msg)) + return; + if (strncmp(msg, "SHRINK", 6) == 0) { + if (!cmm_skip_blanks(msg + 6, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_set_pages(nr); + } else if (strncmp(msg, "RELEASE", 7) == 0) { + if (!cmm_skip_blanks(msg + 7, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_add_timed_pages(nr); + } else if (strncmp(msg, "REUSE", 5) == 0) { + if (!cmm_skip_blanks(msg + 5, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + if (!cmm_skip_blanks(msg, &msg)) + return; + seconds = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_set_timeout(nr, seconds); + } +} +#endif + +static struct ctl_table_header *cmm_sysctl_header; + +static int __init cmm_init(void) +{ + int rc = -ENOMEM; + + cmm_sysctl_header = register_sysctl("vm", cmm_table); + if (!cmm_sysctl_header) + goto out_sysctl; +#ifdef CONFIG_CMM_IUCV + /* convert sender to uppercase characters */ + if (sender) + string_upper(sender, sender); + else + sender = cmm_default_sender; + + rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); + if (rc < 0) + goto out_smsg; +#endif + rc = register_oom_notifier(&cmm_oom_nb); + if (rc < 0) + goto out_oom_notify; + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (!IS_ERR(cmm_thread_ptr)) + return 0; + + rc = PTR_ERR(cmm_thread_ptr); + unregister_oom_notifier(&cmm_oom_nb); +out_oom_notify: +#ifdef CONFIG_CMM_IUCV + smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); +out_smsg: +#endif + unregister_sysctl_table(cmm_sysctl_header); +out_sysctl: + del_timer_sync(&cmm_timer); + return rc; +} +module_init(cmm_init); + +static void __exit cmm_exit(void) +{ + unregister_sysctl_table(cmm_sysctl_header); +#ifdef CONFIG_CMM_IUCV + smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); +#endif + unregister_oom_notifier(&cmm_oom_nb); + kthread_stop(cmm_thread_ptr); + del_timer_sync(&cmm_timer); + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); +} +module_exit(cmm_exit); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c new file mode 100644 index 0000000000..b51666967a --- /dev/null +++ b/arch/s390/mm/dump_pagetables.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long max_addr; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +enum address_markers_idx { + IDENTITY_BEFORE_NR = 0, + IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, + KERNEL_START_NR, + KERNEL_END_NR, +#ifdef CONFIG_KFENCE + KFENCE_START_NR, + KFENCE_END_NR, +#endif + IDENTITY_AFTER_NR, + IDENTITY_AFTER_END_NR, + VMEMMAP_NR, + VMEMMAP_END_NR, + VMALLOC_NR, + VMALLOC_END_NR, + MODULES_NR, + MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif +}; + +static struct addr_marker address_markers[] = { + [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, + [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, + [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, +#ifdef CONFIG_KFENCE + [KFENCE_START_NR] = {0, "KFence Pool Start"}, + [KFENCE_END_NR] = {0, "KFence Pool End"}, +#endif + [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, + [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, + [VMEMMAP_NR] = {0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, + [VMALLOC_NR] = {0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [MODULES_NR] = {0, "Modules Area Start"}, + [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, + [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, +#endif + { -1, NULL } +}; + +struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; + int level; + unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long start_address; + const struct addr_marker *marker; +}; + +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt); \ +}) + +static void print_prot(struct seq_file *m, unsigned int pr, int level) +{ + static const char * const level_name[] = + { "ASCE", "PGD", "PUD", "PMD", "PTE" }; + + pt_dump_seq_printf(m, "%s ", level_name[level]); + if (pr & _PAGE_INVALID) { + pt_dump_seq_printf(m, "I\n"); + return; + } + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_DEBUG_WX + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. + */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + return; + WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) +{ + int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; + const char *unit = units; + unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; + + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. */ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->start_address = addr; + st->current_prot = prot; + st->level = level; + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; + while (!(delta & 0x3ff) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); + print_prot(m, st->current_prot, st->level); + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + st->start_address = addr; + st->current_prot = prot; + st->level = level; + } +} + +#ifdef CONFIG_DEBUG_WX +void ptdump_check_wx(void) +{ + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; + + if (!MACHINE_HAS_NX) + return; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + "unexpected " : ""); +} +#endif /* CONFIG_DEBUG_WX */ + +#ifdef CONFIG_PTDUMP_DEBUGFS +static int ptdump_show(struct seq_file *m, void *v) +{ + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = address_markers, + }; + + get_online_mems(); + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); + put_online_mems(); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ + +/* + * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple + * insertion sort to preserve the original order of markers with the same + * start address. + */ +static void sort_address_markers(void) +{ + struct addr_marker tmp; + int i, j; + + for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { + tmp = address_markers[i]; + for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) + address_markers[j + 1] = address_markers[j]; + address_markers[j + 1] = tmp; + } +} + +static int pt_dump_init(void) +{ +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif + /* + * Figure out the maximum virtual address being accessible with the + * kernel ASCE. We need this to keep the page table walker functions + * from accessing non-existent entries. + */ + max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = 1UL << (max_addr * 11 + 31); + address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; + address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31; + address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE; + address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; + address_markers[VMALLOC_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +#ifdef CONFIG_KFENCE + address_markers[KFENCE_START_NR].start_address = kfence_start; + address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; +#endif + sort_address_markers(); +#ifdef CONFIG_PTDUMP_DEBUGFS + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ + return 0; +} +device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c new file mode 100644 index 0000000000..fe87291df9 --- /dev/null +++ b/arch/s390/mm/extable.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + size_t len = FIELD_GET(EX_DATA_LEN, ex->data); + + regs->gprs[reg_err] = -EFAULT; + memset((void *)regs->gprs[reg_addr], 0, len); + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) +{ + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_STORE: + return ex_handler_ua_store(ex, regs); + case EX_TYPE_UA_LOAD_MEM: + return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + } + panic("invalid exception table entry"); +} diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c new file mode 100644 index 0000000000..e41869f5cc --- /dev/null +++ b/arch/s390/mm/extmem.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author(s)......: Carsten Otte + * Rob M van der Heij + * Steven Shultz + * Bugreports.to..: + * Copyright IBM Corp. 2002, 2004 + */ + +#define KMSG_COMPONENT "extmem" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DCSS_PURGESEG 0x08 +#define DCSS_LOADSHRX 0x20 +#define DCSS_LOADNSRX 0x24 +#define DCSS_FINDSEGX 0x2c +#define DCSS_SEGEXTX 0x38 +#define DCSS_FINDSEGA 0x0c + +struct qrange { + unsigned long start; /* last byte type */ + unsigned long end; /* last byte reserved */ +}; + +struct qout64 { + unsigned long segstart; + unsigned long segend; + int segcnt; + int segrcnt; + struct qrange range[6]; +}; + +struct qin64 { + char qopcode; + char rsrv1[3]; + char qrcode; + char rsrv2[3]; + char qname[8]; + unsigned int qoutptr; + short int qoutlen; +}; + +struct dcss_segment { + struct list_head list; + char dcss_name[8]; + char res_name[16]; + unsigned long start_addr; + unsigned long end; + refcount_t ref_count; + int do_nonshared; + unsigned int vm_segtype; + struct qrange range[6]; + int segcnt; + struct resource *res; +}; + +static DEFINE_MUTEX(dcss_lock); +static LIST_HEAD(dcss_list); +static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", "EN", "SC", + "EW/EN-MIXED" }; +static int loadshr_scode = DCSS_LOADSHRX; +static int loadnsr_scode = DCSS_LOADNSRX; +static int purgeseg_scode = DCSS_PURGESEG; +static int segext_scode = DCSS_SEGEXTX; + +/* + * Create the 8 bytes, ebcdic VM segment name from + * an ascii name. + */ +static void +dcss_mkname(char *name, char *dcss_name) +{ + int i; + + for (i = 0; i < 8; i++) { + if (name[i] == '\0') + break; + dcss_name[i] = toupper(name[i]); + } + for (; i < 8; i++) + dcss_name[i] = ' '; + ASCEBC(dcss_name, 8); +} + + +/* + * search all segments in dcss_list, and return the one + * namend *name. If not found, return NULL. + */ +static struct dcss_segment * +segment_by_name (char *name) +{ + char dcss_name[9]; + struct list_head *l; + struct dcss_segment *tmp, *retval = NULL; + + BUG_ON(!mutex_is_locked(&dcss_lock)); + dcss_mkname (name, dcss_name); + list_for_each (l, &dcss_list) { + tmp = list_entry (l, struct dcss_segment, list); + if (memcmp(tmp->dcss_name, dcss_name, 8) == 0) { + retval = tmp; + break; + } + } + return retval; +} + + +/* + * Perform a function on a dcss segment. + */ +static inline int +dcss_diag(int *func, void *parameter, + unsigned long *ret1, unsigned long *ret2) +{ + unsigned long rx, ry; + int rc; + + rx = (unsigned long) parameter; + ry = (unsigned long) *func; + + diag_stat_inc(DIAG_STAT_X064); + asm volatile( + " diag %0,%1,0x64\n" + " ipm %2\n" + " srl %2,28\n" + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + *ret1 = rx; + *ret2 = ry; + return rc; +} + +static inline int +dcss_diag_translate_rc (int vm_rc) { + if (vm_rc == 44) + return -ENOENT; + return -EIO; +} + + +/* do a diag to get info about a segment. + * fills start_address, end and vm_segtype fields + */ +static int +query_segment_type (struct dcss_segment *seg) +{ + unsigned long dummy, vmrc; + int diag_cc, rc, i; + struct qout64 *qout; + struct qin64 *qin; + + qin = kmalloc(sizeof(*qin), GFP_KERNEL | GFP_DMA); + qout = kmalloc(sizeof(*qout), GFP_KERNEL | GFP_DMA); + if ((qin == NULL) || (qout == NULL)) { + rc = -ENOMEM; + goto out_free; + } + + /* initialize diag input parameters */ + qin->qopcode = DCSS_FINDSEGA; + qin->qoutptr = (unsigned long) qout; + qin->qoutlen = sizeof(struct qout64); + memcpy (qin->qname, seg->dcss_name, 8); + + diag_cc = dcss_diag(&segext_scode, qin, &dummy, &vmrc); + + if (diag_cc < 0) { + rc = diag_cc; + goto out_free; + } + if (diag_cc > 1) { + pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc); + rc = dcss_diag_translate_rc (vmrc); + goto out_free; + } + + if (qout->segcnt > 6) { + rc = -EOPNOTSUPP; + goto out_free; + } + + if (qout->segcnt == 1) { + seg->vm_segtype = qout->range[0].start & 0xff; + } else { + /* multi-part segment. only one type supported here: + - all parts are contiguous + - all parts are either EW or EN type + - maximum 6 parts allowed */ + unsigned long start = qout->segstart >> PAGE_SHIFT; + for (i=0; isegcnt; i++) { + if (((qout->range[i].start & 0xff) != SEG_TYPE_EW) && + ((qout->range[i].start & 0xff) != SEG_TYPE_EN)) { + rc = -EOPNOTSUPP; + goto out_free; + } + if (start != qout->range[i].start >> PAGE_SHIFT) { + rc = -EOPNOTSUPP; + goto out_free; + } + start = (qout->range[i].end >> PAGE_SHIFT) + 1; + } + seg->vm_segtype = SEG_TYPE_EWEN; + } + + /* analyze diag output and update seg */ + seg->start_addr = qout->segstart; + seg->end = qout->segend; + + memcpy (seg->range, qout->range, 6*sizeof(struct qrange)); + seg->segcnt = qout->segcnt; + + rc = 0; + + out_free: + kfree(qin); + kfree(qout); + return rc; +} + +/* + * get info about a segment + * possible return values: + * -ENOSYS : we are not running on VM + * -EIO : could not perform query diagnose + * -ENOENT : no such segment + * -EOPNOTSUPP: multi-part segment cannot be used with linux + * -ENOMEM : out of memory + * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h + */ +int +segment_type (char* name) +{ + int rc; + struct dcss_segment seg; + + if (!MACHINE_IS_VM) + return -ENOSYS; + + dcss_mkname(name, seg.dcss_name); + rc = query_segment_type (&seg); + if (rc < 0) + return rc; + return seg.vm_segtype; +} + +/* + * check if segment collides with other segments that are currently loaded + * returns 1 if this is the case, 0 if no collision was found + */ +static int +segment_overlaps_others (struct dcss_segment *seg) +{ + struct list_head *l; + struct dcss_segment *tmp; + + BUG_ON(!mutex_is_locked(&dcss_lock)); + list_for_each(l, &dcss_list) { + tmp = list_entry(l, struct dcss_segment, list); + if ((tmp->start_addr >> 20) > (seg->end >> 20)) + continue; + if ((tmp->end >> 20) < (seg->start_addr >> 20)) + continue; + if (seg == tmp) + continue; + return 1; + } + return 0; +} + +/* + * real segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 + */ +static int +__segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) +{ + unsigned long start_addr, end_addr, dummy; + struct dcss_segment *seg; + int rc, diag_cc, segtype; + + start_addr = end_addr = 0; + segtype = -1; + seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); + if (seg == NULL) { + rc = -ENOMEM; + goto out; + } + dcss_mkname (name, seg->dcss_name); + rc = query_segment_type (seg); + if (rc < 0) + goto out_free; + + if (segment_overlaps_others(seg)) { + rc = -EBUSY; + goto out_free; + } + + seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); + if (seg->res == NULL) { + rc = -ENOMEM; + goto out_free; + } + seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + seg->res->start = seg->start_addr; + seg->res->end = seg->end; + memcpy(&seg->res_name, seg->dcss_name, 8); + EBCASC(seg->res_name, 8); + seg->res_name[8] = '\0'; + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); + seg->res->name = seg->res_name; + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) + seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. */ + if (request_resource(&iomem_resource, seg->res)) { + rc = -EBUSY; + goto out_free_resource; + } + + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + + if (do_nonshared) + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); + else + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); + rc = diag_cc; + goto out_mapping; + } + if (diag_cc > 1) { + pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); + goto out_mapping; + } + seg->start_addr = start_addr; + seg->end = end_addr; + seg->do_nonshared = do_nonshared; + refcount_set(&seg->ref_count, 1); + list_add(&seg->list, &dcss_list); + *addr = seg->start_addr; + *end = seg->end; + if (do_nonshared) + pr_info("DCSS %s of range %px to %px and type %s loaded as " + "exclusive-writable\n", name, (void*) seg->start_addr, + (void*) seg->end, segtype_string[seg->vm_segtype]); + else { + pr_info("DCSS %s of range %px to %px and type %s loaded in " + "shared access mode\n", name, (void*) seg->start_addr, + (void*) seg->end, segtype_string[seg->vm_segtype]); + } + goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + out_resource: + release_resource(seg->res); + out_free_resource: + kfree(seg->res); + out_free: + kfree(seg); + out: + return rc < 0 ? rc : segtype; +} + +/* + * this function loads a DCSS segment + * name : name of the DCSS + * do_nonshared : 0 indicates that the dcss should be shared with other linux images + * 1 indicates that the dcss should be exclusive for this linux image + * addr : will be filled with start address of the segment + * end : will be filled with end address of the segment + * return values: + * -ENOSYS : we are not running on VM + * -EIO : could not perform query or load diagnose + * -ENOENT : no such segment + * -EOPNOTSUPP: multi-part segment cannot be used with linux + * -EBUSY : segment cannot be used (overlaps with dcss or storage) + * -ERANGE : segment cannot be used (exceeds kernel mapping range) + * -EPERM : segment is currently loaded with incompatible permissions + * -ENOMEM : out of memory + * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h + */ +int +segment_load (char *name, int do_nonshared, unsigned long *addr, + unsigned long *end) +{ + struct dcss_segment *seg; + int rc; + + if (!MACHINE_IS_VM) + return -ENOSYS; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) + rc = __segment_load (name, do_nonshared, addr, end); + else { + if (do_nonshared == seg->do_nonshared) { + refcount_inc(&seg->ref_count); + *addr = seg->start_addr; + *end = seg->end; + rc = seg->vm_segtype; + } else { + *addr = *end = 0; + rc = -EPERM; + } + } + mutex_unlock(&dcss_lock); + return rc; +} + +/* + * this function modifies the shared state of a DCSS segment. note that + * name : name of the DCSS + * do_nonshared : 0 indicates that the dcss should be shared with other linux images + * 1 indicates that the dcss should be exclusive for this linux image + * return values: + * -EIO : could not perform load diagnose (segment gone!) + * -ENOENT : no such segment (segment gone!) + * -EAGAIN : segment is in use by other exploiters, try later + * -EINVAL : no segment with the given name is currently loaded - name invalid + * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * 0 : operation succeeded + */ +int +segment_modify_shared (char *name, int do_nonshared) +{ + struct dcss_segment *seg; + unsigned long start_addr, end_addr, dummy; + int rc, diag_cc; + + start_addr = end_addr = 0; + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) { + rc = -EINVAL; + goto out_unlock; + } + if (do_nonshared == seg->do_nonshared) { + pr_info("DCSS %s is already in the requested access " + "mode\n", name); + rc = 0; + goto out_unlock; + } + if (refcount_read(&seg->ref_count) != 1) { + pr_warn("DCSS %s is in use and cannot be reloaded\n", name); + rc = -EAGAIN; + goto out_unlock; + } + release_resource(seg->res); + if (do_nonshared) + seg->res->flags &= ~IORESOURCE_READONLY; + else + if (seg->vm_segtype == SEG_TYPE_SR || + seg->vm_segtype == SEG_TYPE_ER) + seg->res->flags |= IORESOURCE_READONLY; + + if (request_resource(&iomem_resource, seg->res)) { + pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n", + name); + rc = -EBUSY; + kfree(seg->res); + goto out_del_mem; + } + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + if (do_nonshared) + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); + else + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + rc = diag_cc; + goto out_del_res; + } + if (diag_cc > 1) { + pr_warn("Reloading DCSS %s failed with rc=%ld\n", + name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + goto out_del_res; + } + seg->start_addr = start_addr; + seg->end = end_addr; + seg->do_nonshared = do_nonshared; + rc = 0; + goto out_unlock; + out_del_res: + release_resource(seg->res); + kfree(seg->res); + out_del_mem: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + list_del(&seg->list); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + kfree(seg); + out_unlock: + mutex_unlock(&dcss_lock); + return rc; +} + +/* + * Decrease the use count of a DCSS segment and remove + * it from the address space if nobody is using it + * any longer. + */ +void +segment_unload(char *name) +{ + unsigned long dummy; + struct dcss_segment *seg; + + if (!MACHINE_IS_VM) + return; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) { + pr_err("Unloading unknown DCSS %s failed\n", name); + goto out_unlock; + } + if (!refcount_dec_and_test(&seg->ref_count)) + goto out_unlock; + release_resource(seg->res); + kfree(seg->res); + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + list_del(&seg->list); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + kfree(seg); +out_unlock: + mutex_unlock(&dcss_lock); +} + +/* + * save segment content permanently + */ +void +segment_save(char *name) +{ + struct dcss_segment *seg; + char cmd1[160]; + char cmd2[80]; + int i, response; + + if (!MACHINE_IS_VM) + return; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + + if (seg == NULL) { + pr_err("Saving unknown DCSS %s failed\n", name); + goto out; + } + + sprintf(cmd1, "DEFSEG %s", name); + for (i=0; isegcnt; i++) { + sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); + } + sprintf(cmd2, "SAVESEG %s", name); + response = 0; + cpcmd(cmd1, NULL, 0, &response); + if (response) { + pr_err("Saving a DCSS failed with DEFSEG response code " + "%i\n", response); + goto out; + } + cpcmd(cmd2, NULL, 0, &response); + if (response) { + pr_err("Saving a DCSS failed with SAVESEG response code " + "%i\n", response); + goto out; + } +out: + mutex_unlock(&dcss_lock); +} + +/* + * print appropriate error message for segment_load()/segment_type() + * return code + */ +void segment_warning(int rc, char *seg_name) +{ + switch (rc) { + case -ENOENT: + pr_err("DCSS %s cannot be loaded or queried\n", seg_name); + break; + case -ENOSYS: + pr_err("DCSS %s cannot be loaded or queried without " + "z/VM\n", seg_name); + break; + case -EIO: + pr_err("Loading or querying DCSS %s resulted in a " + "hardware error\n", seg_name); + break; + case -EOPNOTSUPP: + pr_err("DCSS %s has multiple page ranges and cannot be " + "loaded or queried\n", seg_name); + break; + case -EBUSY: + pr_err("%s needs used memory resources and cannot be " + "loaded or queried\n", seg_name); + break; + case -EPERM: + pr_err("DCSS %s is already loaded in a different access " + "mode\n", seg_name); + break; + case -ENOMEM: + pr_err("There is not enough memory to load or query " + "DCSS %s\n", seg_name); + break; + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); + break; + } + default: + break; + } +} + +EXPORT_SYMBOL(segment_load); +EXPORT_SYMBOL(segment_unload); +EXPORT_SYMBOL(segment_save); +EXPORT_SYMBOL(segment_type); +EXPORT_SYMBOL(segment_modify_shared); +EXPORT_SYMBOL(segment_warning); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c new file mode 100644 index 0000000000..b678295931 --- /dev/null +++ b/arch/s390/mm/fault.c @@ -0,0 +1,708 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) + * + * Derived from "arch/i386/mm/fault.c" + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kernel/entry.h" + +#define __FAIL_ADDR_MASK -4096L + +/* + * Allocate private vm_fault_reason from top. Please make sure it won't + * collide with vm_fault_reason. + */ +#define VM_FAULT_BADCONTEXT ((__force vm_fault_t)0x80000000) +#define VM_FAULT_BADMAP ((__force vm_fault_t)0x40000000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x20000000) +#define VM_FAULT_SIGNAL ((__force vm_fault_t)0x10000000) +#define VM_FAULT_PFAULT ((__force vm_fault_t)0x8000000) + +enum fault_type { + KERNEL_FAULT, + USER_FAULT, + GMAP_FAULT, +}; + +static unsigned long store_indication __read_mostly; + +static int __init fault_init(void) +{ + if (test_facility(75)) + store_indication = 0xc00; + return 0; +} +early_initcall(fault_init); + +/* + * Find out which address space caused the exception. + */ +static enum fault_type get_fault_type(struct pt_regs *regs) +{ + unsigned long trans_exc_code; + + trans_exc_code = regs->int_parm_long & 3; + if (likely(trans_exc_code == 0)) { + /* primary space exception */ + if (user_mode(regs)) + return USER_FAULT; + if (!IS_ENABLED(CONFIG_PGSTE)) + return KERNEL_FAULT; + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + return KERNEL_FAULT; + } + if (trans_exc_code == 2) + return USER_FAULT; + if (trans_exc_code == 1) { + /* access register mode, not used in the kernel */ + return USER_FAULT; + } + /* home space exception -> access via kernel ASCE */ + return KERNEL_FAULT; +} + +static unsigned long get_fault_address(struct pt_regs *regs) +{ + unsigned long trans_exc_code = regs->int_parm_long; + + return trans_exc_code & __FAIL_ADDR_MASK; +} + +static bool fault_is_write(struct pt_regs *regs) +{ + unsigned long trans_exc_code = regs->int_parm_long; + + return (trans_exc_code & store_indication) == 0x400; +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return get_kernel_nofault(dummy, (unsigned long *)p); +} + +static void dump_pagetable(unsigned long asce, unsigned long address) +{ + unsigned long *table = __va(asce & _ASCE_ORIGIN); + + pr_alert("AS:%016lx ", asce); + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R1:%016lx ", *table); + if (*table & _REGION_ENTRY_INVALID) + goto out; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R2:%016lx ", *table); + if (*table & _REGION_ENTRY_INVALID) + goto out; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R3:%016lx ", *table); + if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + goto out; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("S:%016lx ", *table); + if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + goto out; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + } + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("P:%016lx ", *table); +out: + pr_cont("\n"); + return; +bad: + pr_cont("BAD\n"); +} + +static void dump_fault_info(struct pt_regs *regs) +{ + unsigned long asce; + + pr_alert("Failing address: %016lx TEID: %016lx\n", + regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + pr_alert("Fault in "); + switch (regs->int_parm_long & 3) { + case 3: + pr_cont("home space "); + break; + case 2: + pr_cont("secondary space "); + break; + case 1: + pr_cont("access register "); + break; + case 0: + pr_cont("primary space "); + break; + } + pr_cont("mode while using "); + switch (get_fault_type(regs)) { + case USER_FAULT: + asce = S390_lowcore.user_asce; + pr_cont("user "); + break; + case GMAP_FAULT: + asce = ((struct gmap *) S390_lowcore.gmap)->asce; + pr_cont("gmap "); + break; + case KERNEL_FAULT: + asce = S390_lowcore.kernel_asce; + pr_cont("kernel "); + break; + default: + unreachable(); + } + pr_cont("ASCE.\n"); + dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); +} + +int show_unhandled_signals = 1; + +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) +{ + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) + return; + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); + print_vma_addr(KERN_CONT "in ", regs->psw.addr); + printk(KERN_CONT "\n"); + if (is_mm_fault) + dump_fault_info(regs); + show_regs(regs); +} + +/* + * Send SIGSEGV to task. This is an external routine + * to keep the stack usage of do_page_fault small. + */ +static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +{ + report_user_fault(regs, SIGSEGV, 1); + force_sig_fault(SIGSEGV, si_code, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); +} + +static noinline void do_no_context(struct pt_regs *regs, vm_fault_t fault) +{ + enum fault_type fault_type; + unsigned long address; + bool is_write; + + if (fixup_exception(regs)) + return; + fault_type = get_fault_type(regs); + if ((fault_type == KERNEL_FAULT) && (fault == VM_FAULT_BADCONTEXT)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + } + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (fault_type == KERNEL_FAULT) + printk(KERN_ALERT "Unable to handle kernel pointer dereference" + " in virtual kernel address space\n"); + else + printk(KERN_ALERT "Unable to handle kernel paging request" + " in virtual user address space\n"); + dump_fault_info(regs); + die(regs, "Oops"); +} + +static noinline void do_low_address(struct pt_regs *regs) +{ + /* Low-address protection hit in kernel mode means + NULL pointer write access in kernel mode. */ + if (regs->psw.mask & PSW_MASK_PSTATE) { + /* Low-address protection hit in user mode 'cannot happen'. */ + die (regs, "Low-address protection"); + } + + do_no_context(regs, VM_FAULT_BADACCESS); +} + +static noinline void do_sigbus(struct pt_regs *regs) +{ + /* + * Send a sigbus, regardless of whether we were in kernel + * or user mode. + */ + force_sig_fault(SIGBUS, BUS_ADRERR, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); +} + +static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) +{ + int si_code; + + switch (fault) { + case VM_FAULT_BADACCESS: + case VM_FAULT_BADMAP: + /* Bad memory access. Check if it is kernel or user space. */ + if (user_mode(regs)) { + /* User mode accesses just cause a SIGSEGV */ + si_code = (fault == VM_FAULT_BADMAP) ? + SEGV_MAPERR : SEGV_ACCERR; + do_sigsegv(regs, si_code); + break; + } + fallthrough; + case VM_FAULT_BADCONTEXT: + case VM_FAULT_PFAULT: + do_no_context(regs, fault); + break; + case VM_FAULT_SIGNAL: + if (!user_mode(regs)) + do_no_context(regs, fault); + break; + default: /* fault & VM_FAULT_ERROR */ + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + do_no_context(regs, fault); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs, fault); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & VM_FAULT_SIGBUS) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs, fault); + else + do_sigbus(regs); + } else + BUG(); + break; + } +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + * + * interruption code (int_code): + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) + */ +static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +{ + struct gmap *gmap; + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum fault_type type; + unsigned long address; + unsigned int flags; + vm_fault_t fault; + bool is_write; + + tsk = current; + /* + * The instruction that caused the program check has + * been nullified. Don't signal single step via SIGTRAP. + */ + clear_thread_flag(TIF_PER_TRAP); + + if (kprobe_page_fault(regs, 14)) + return 0; + + mm = tsk->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); + + /* + * Verify that the fault happened in user space, that + * we are not in an interrupt and that there is a + * user context. + */ + fault = VM_FAULT_BADCONTEXT; + type = get_fault_type(regs); + switch (type) { + case KERNEL_FAULT: + goto out; + case USER_FAULT: + case GMAP_FAULT: + if (faulthandler_disabled() || !mm) + goto out; + break; + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + flags = FAULT_FLAG_DEFAULT; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + if (likely(!(fault & VM_FAULT_ERROR))) + fault = 0; + goto out; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + fault = VM_FAULT_SIGNAL; + goto out; + } +lock_mmap: + mmap_read_lock(mm); + + gmap = NULL; + if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { + gmap = (struct gmap *) S390_lowcore.gmap; + current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; + address = __gmap_translate(gmap, address); + if (address == -EFAULT) { + fault = VM_FAULT_BADMAP; + goto out_up; + } + if (gmap->pfault_enabled) + flags |= FAULT_FLAG_RETRY_NOWAIT; + } + +retry: + fault = VM_FAULT_BADMAP; + vma = find_vma(mm, address); + if (!vma) + goto out_up; + + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; + vma = expand_stack(mm, address); + if (!vma) + goto out; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + fault = VM_FAULT_BADACCESS; + if (unlikely(!(vma->vm_flags & access))) + goto out_up; + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { + fault = VM_FAULT_SIGNAL; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out_up; + goto out; + } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto out_gmap; + } + fault = 0; + goto out; + } + + if (unlikely(fault & VM_FAULT_ERROR)) + goto out_up; + + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; + } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; + } +out_gmap: + if (IS_ENABLED(CONFIG_PGSTE) && gmap) { + address = __gmap_link(gmap, current->thread.gmap_addr, + address); + if (address == -EFAULT) { + fault = VM_FAULT_BADMAP; + goto out_up; + } + if (address == -ENOMEM) { + fault = VM_FAULT_OOM; + goto out_up; + } + } + fault = 0; +out_up: + mmap_read_unlock(mm); +out: + return fault; +} + +void do_protection_exception(struct pt_regs *regs) +{ + unsigned long trans_exc_code; + int access; + vm_fault_t fault; + + trans_exc_code = regs->int_parm_long; + /* + * Protection exceptions are suppressing, decrement psw address. + * The exception to this rule are aborted transactions, for these + * the PSW already points to the correct location. + */ + if (!(regs->int_code & 0x200)) + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + /* + * Check for low-address protection. This needs to be treated + * as a special case because the translation exception code + * field is not guaranteed to contain valid data in this case. + */ + if (unlikely(!(trans_exc_code & 4))) { + do_low_address(regs); + return; + } + if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { + regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | + (regs->psw.addr & PAGE_MASK); + access = VM_EXEC; + fault = VM_FAULT_BADACCESS; + } else { + access = VM_WRITE; + fault = do_exception(regs, access); + } + if (unlikely(fault)) + do_fault_error(regs, fault); +} +NOKPROBE_SYMBOL(do_protection_exception); + +void do_dat_exception(struct pt_regs *regs) +{ + int access; + vm_fault_t fault; + + access = VM_ACCESS_FLAGS; + fault = do_exception(regs, access); + if (unlikely(fault)) + do_fault_error(regs, fault); +} +NOKPROBE_SYMBOL(do_dat_exception); + +#if IS_ENABLED(CONFIG_PGSTE) + +void do_secure_storage_access(struct pt_regs *regs) +{ + unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + struct gmap *gmap; + int rc; + + /* + * bit 61 tells us if the address is valid, if it's not we + * have a major problem and should stop the kernel or send a + * SIGSEGV to the process. Unfortunately bit 61 is not + * reliable without the misc UV feature so we need to check + * for that as well. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC) && + !test_bit_inv(61, ®s->int_parm_long)) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; + } + + /* + * The kernel should never run into this case and we + * have no way out of this situation. + */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); + } + + switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) { + do_fault_error(regs, VM_FAULT_BADMAP); + break; + } + fallthrough; + case USER_FAULT: + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) { + mmap_read_unlock(mm); + do_fault_error(regs, VM_FAULT_BADMAP); + break; + } + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + mmap_read_unlock(mm); + break; + } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + mmap_read_unlock(mm); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + default: + do_fault_error(regs, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + } +} +NOKPROBE_SYMBOL(do_secure_storage_access); + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + if (get_fault_type(regs) != GMAP_FAULT) { + do_fault_error(regs, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + return; + } + + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); +} +NOKPROBE_SYMBOL(do_non_secure_storage_access); + +void do_secure_storage_violation(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; + /* + * Either KVM messed up the secure guest mapping or the same + * page is mapped into multiple secure guests. + * + * This exception is only triggered when a guest 2 is running + * and can therefore never occur in kernel context. + */ + printk_ratelimited(KERN_WARNING + "Secure storage violation in task: %s, pid %d\n", + current->comm, current->pid); + send_sig(SIGSEGV, current, 0); +} + +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 0000000000..20786f6883 --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,2894 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020 + * Author(s): Martin Schwidefsky + * David Hildenbrand + * Janosch Frank + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + arch_set_page_dat(page, CRST_ALLOC_ORDER); + return page; +} + +/** + * gmap_alloc - allocate and initialize a guest address space + * @limit: maximum address of the gmap address space + * + * Returns a guest address space structure. + */ +static struct gmap *gmap_alloc(unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); + spin_lock_init(&gmap->guest_table_lock); + spin_lock_init(&gmap->shadow_lock); + refcount_set(&gmap->ref_count, 1); + page = gmap_alloc_crst(); + if (!page) + goto out_free; + page->index = 0; + list_add(&page->lru, &gmap->crst_list); + table = page_to_virt(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + unsigned long gmap_asce; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + if (list_is_singular(&mm->context.gmap_list)) + gmap_asce = gmap->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(mm->context.gmap_asce, gmap_asce); + spin_unlock(&mm->context.lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_idte(gmap->asce); + else + __tlb_flush_global(); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. + */ +static void gmap_free(struct gmap *gmap) +{ + struct page *page, *next; + + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); + /* Free all segment & region tables. */ + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) + __free_pages(page, CRST_ALLOC_ORDER); + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + + kfree(gmap); +} + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + refcount_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->ref_count)) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + unsigned long gmap_asce; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.lock); + list_del_rcu(&gmap->list); + if (list_empty(&gmap->mm->context.gmap_list)) + gmap_asce = 0; + else if (list_is_singular(&gmap->mm->context.gmap_list)) + gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, + struct gmap, list)->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); + spin_unlock(&gmap->mm->context.lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); + +/** + * gmap_enable - switch primary space to the guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_enable(struct gmap *gmap) +{ + S390_lowcore.gmap = (unsigned long) gmap; +} +EXPORT_SYMBOL_GPL(gmap_enable); + +/** + * gmap_disable - switch back to the standard primary address space + * @gmap: pointer to the guest address space structure + */ +void gmap_disable(struct gmap *gmap) +{ + S390_lowcore.gmap = 0UL; +} +EXPORT_SYMBOL_GPL(gmap_disable); + +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + +/* + * gmap_alloc_table is assumed to be called with mmap_lock held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + new = page_to_virt(page); + crst_table_init(new, init); + spin_lock(&gmap->guest_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + list_add(&page->lru, &gmap->crst_list); + *table = __pa(new) | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page->index = gaddr; + page = NULL; + } + spin_unlock(&gmap->guest_table_lock); + if (page) + __free_pages(page, CRST_ALLOC_ORDER); + return 0; +} + +/** + * __gmap_segment_gaddr - find virtual address from segment pointer + * @entry: pointer to a segment table entry in the guest address space + * + * Returns the virtual address in the guest address space for the segment + */ +static unsigned long __gmap_segment_gaddr(unsigned long *entry) +{ + struct page *page; + unsigned long offset; + + offset = (unsigned long) entry / sizeof(unsigned long); + offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; + page = pmd_pgtable_page((pmd_t *) entry); + return page->index + offset; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * @vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long *entry; + int flush = 0; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); + if (entry) { + flush = (*entry != _SEGMENT_ENTRY_EMPTY); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) + return -EINVAL; + + flush = 0; + mmap_write_lock(gmap->mm); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + mmap_write_unlock(gmap->mm); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. + * + * Note: Can also be called for shadow gmaps. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long rc; + + mmap_read_lock(gmap->mm); + rc = __gmap_translate(gmap, gaddr); + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @mm: pointer to the parent mm_struct + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } + rcu_read_unlock(); +} + +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + +/** + * __gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_lock of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + u64 unprot; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & _REGION1_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & _REGION2_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & _REGION3_MASK)) + return -ENOMEM; + table = __va(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vmaddr); + VM_BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, vmaddr); + VM_BUG_ON(pud_none(*pud)); + /* large puds cannot yet be handled */ + if (pud_large(*pud)) + return -EFAULT; + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* Are we allowed to use huge pages? */ + if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + return -EFAULT; + /* Link gmap segment table entry location to page table. */ + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_EMPTY) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, table); + if (!rc) { + if (pmd_large(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; + } else + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} + +/** + * gmap_fault - resolve a fault on a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @fault_flags: flags to pass down to handle_mm_fault() + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + */ +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) +{ + unsigned long vmaddr; + int rc; + bool unlocked; + + mmap_read_lock(gmap->mm); + +retry: + unlocked = false; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + goto out_up; + } + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, + &unlocked)) { + rc = -EFAULT; + goto out_up; + } + /* + * In the case that fixup_user_fault unlocked the mmap_lock during + * faultin redo __gmap_translate to not race with a map/unmap_segment. + */ + if (unlocked) + goto retry; + + rc = __gmap_link(gmap, gaddr, vmaddr); +out_up: + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_fault); + +/* + * this function is assumed to be called with mmap_lock held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + struct vm_area_struct *vma; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + + vma = vma_lookup(gmap->mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); + if (likely(ptep)) { + ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); + pte_unmap_unlock(ptep, ptl); + } + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) +{ + unsigned long gaddr, vmaddr, size; + struct vm_area_struct *vma; + + mmap_read_lock(gmap->mm); + for (gaddr = from; gaddr < to; + gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) + continue; + vmaddr |= gaddr & ~PMD_MASK; + /* Find vma in the parent mm */ + vma = find_vma(gmap->mm, vmaddr); + if (!vma) + continue; + /* + * We do not discard pages that are backed by + * hugetlbfs, so we don't have to refault them. + */ + if (is_vm_hugetlb_page(vma)) + continue; + size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); + zap_page_range_single(vma, vmaddr, size, NULL); + } + mmap_read_unlock(gmap->mm); +} +EXPORT_SYMBOL_GPL(gmap_discard); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_pte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add_rcu(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); + +/** + * gmap_unregister_pte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_rcu(&nb->list); + spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; + + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) + return NULL; + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { + case _ASCE_TYPE_REGION1: + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION2: + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_REGION3: + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; + case _ASCE_TYPE_SEGMENT: + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + BUG_ON(gmap_is_shadow(gmap)); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) + return NULL; + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_lock, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock + */ +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) +{ + pte_unmap_unlock(ptep, ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; + + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_lock in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_lock in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl; + unsigned long pbits = 0; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptep, ptl); + return rc; +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_lock in read. + */ +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) +{ + unsigned long vmaddr, dist; + pmd_t *pmdp; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + while (len) { + rc = -EAGAIN; + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (pmdp) { + if (!pmd_large(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); + len = len < dist ? 0 : len - dist; + gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; + } + } + gmap_pmd_op_end(gmap, pmdp); + } + if (rc) { + if (rc == -EINVAL) + return rc; + + /* -EAGAIN, fixup of userspace mm and gmap */ + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + } + } + return 0; +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. + */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) + return -EINVAL; + mmap_read_lock(gmap->mm); + rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. + * + * Called with gmap->mm->mmap_lock in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + if (gmap_is_shadow(gmap)) + return -EINVAL; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *)__va(address); + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptep, ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + struct gmap_rmap *temp; + void __rcu **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptep, ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " idte %0,0,%1" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long *ste; + phys_addr_t sto, pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + page = phys_to_page(pgt); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + struct page *page; + phys_addr_t pgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); + /* Free page table */ + page = phys_to_page(pgt); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e; + phys_addr_t sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + struct page *page; + phys_addr_t sgt; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); + /* Free segment table */ + page = phys_to_page(sgt); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e; + phys_addr_t r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + phys_addr_t r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); + /* Free region 3 table */ + page = phys_to_page(r3t); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e; + struct page *page; + phys_addr_t r2t; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Free region 2 table */ + page = phys_to_page(r2t); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce; + struct page *page; + phys_addr_t r2t; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = __pa(r1t) | _ASCE_TYPE_REGION1; + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = phys_to_page(r2t); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = __va(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + refcount_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(parent->mm->context.allow_gmap_hpage_1m); + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + refcount_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + mmap_read_lock(parent->mm); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, + PROT_READ, GMAP_NOTIFY_SHADOW); + mmap_read_unlock(parent->mm); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its descendants. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r2t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_r3t; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *table; + phys_addr_t s_sgt; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_pgt_lookup - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_lock in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *table; + struct page *page; + phys_addr_t s_pgt; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_lock in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(sptep, ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(sptep, ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/* + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long gaddr) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long start, end, bits, raddr; + + BUG_ON(!gmap_is_shadow(sg)); + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. + */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) +{ + unsigned long offset, gaddr = 0; + unsigned long *table; + struct gmap *gmap, *sg, *next; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (!table) + continue; + + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, gaddr); + spin_unlock(&gmap->shadow_lock); + } + if (bits & PGSTE_IN_BIT) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + set_pmd(pmdp, new); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (pmdp) { + gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (purge) + __pmdp_csp(pmdp); + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_csp - csp all affected guest pmd entries + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 1); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_csp); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + pte_unmap_unlock(ptep, ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + +static inline void thp_split_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); + walk_page_vma(vma, &thp_split_walk_ops, NULL); + } + mm->def_flags |= VM_NOHUGEPAGE; +} +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was disabled. + * + * mm contracts with s390, that even if mm were to remove a page table, + * racing with the loop below and so causing pte_offset_map_lock() to fail, + * it will never insert a page table containing empty zero pages once + * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!ptep) + break; + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static const struct mm_walk_ops zap_zero_walk_ops = { + .pmd_entry = __zap_zero_pages, + .walk_lock = PGWALK_WRLOCK, +}; + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + /* Fail if the page tables are 2K */ + if (!mm_alloc_pgste(mm)) + return -EINVAL; + mmap_write_lock(mm); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); + mmap_write_unlock(mm); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +int gmap_mark_unmergeable(void) +{ + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + return ksm_disable(current->mm); +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct page *page = pmd_page(*pmd); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE - 1; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &page->flags); + cond_resched(); + return 0; +} + +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, +}; + +int s390_enable_skey(void) +{ + struct mm_struct *mm = current->mm; + int rc = 0; + + mmap_write_lock(mm); + if (mm_uses_skeys(mm)) + goto out_up; + + mm->context.uses_skeys = 1; + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; + } + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); + +out_up: + mmap_write_unlock(mm); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, +}; + +void s390_reset_cmma(struct mm_struct *mm) +{ + mmap_write_lock(mm); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); + mmap_write_unlock(mm); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + unsigned long i; + + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; + + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = 0; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. + */ + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c new file mode 100644 index 0000000000..297a6d897d --- /dev/null +++ b/arch/s390/mm/hugetlbpage.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IBM System z Huge TLB Page Support for Kernel. + * + * Copyright IBM Corp. 2007,2020 + * Author(s): Gerald Schaefer + */ + +#define KMSG_COMPONENT "hugetlb" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +/* + * If the bit selected by single-bit bitmask "a" is set within "x", move + * it to the position indicated by single-bit bitmask "b". + */ +#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b)) + +static inline unsigned long __pte_to_rste(pte_t pte) +{ + unsigned long rste; + + /* + * Convert encoding pte bits pmd / pud bits + * lIR.uswrdy.p dy..R...I...wr + * empty 010.000000.0 -> 00..0...1...00 + * prot-none, clean, old 111.000000.1 -> 00..1...1...00 + * prot-none, clean, young 111.000001.1 -> 01..1...1...00 + * prot-none, dirty, old 111.000010.1 -> 10..1...1...00 + * prot-none, dirty, young 111.000011.1 -> 11..1...1...00 + * read-only, clean, old 111.000100.1 -> 00..1...1...01 + * read-only, clean, young 101.000101.1 -> 01..1...0...01 + * read-only, dirty, old 111.000110.1 -> 10..1...1...01 + * read-only, dirty, young 101.000111.1 -> 11..1...0...01 + * read-write, clean, old 111.001100.1 -> 00..1...1...11 + * read-write, clean, young 101.001101.1 -> 01..1...0...11 + * read-write, dirty, old 110.001110.1 -> 10..0...1...11 + * read-write, dirty, young 100.001111.1 -> 11..0...0...11 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (pte_present(pte)) { + rste = pte_val(pte) & PAGE_MASK; + rste |= move_set_bit(pte_val(pte), _PAGE_READ, + _SEGMENT_ENTRY_READ); + rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, + _SEGMENT_ENTRY_WRITE); + rste |= move_set_bit(pte_val(pte), _PAGE_INVALID, + _SEGMENT_ENTRY_INVALID); + rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT, + _SEGMENT_ENTRY_PROTECT); + rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY, + _SEGMENT_ENTRY_DIRTY); + rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG, + _SEGMENT_ENTRY_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY, + _SEGMENT_ENTRY_SOFT_DIRTY); +#endif + rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, + _SEGMENT_ENTRY_NOEXEC); + } else + rste = _SEGMENT_ENTRY_EMPTY; + return rste; +} + +static inline pte_t __rste_to_pte(unsigned long rste) +{ + unsigned long pteval; + int present; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + present = pud_present(__pud(rste)); + else + present = pmd_present(__pmd(rste)); + + /* + * Convert encoding pmd / pud bits pte bits + * dy..R...I...wr lIR.uswrdy.p + * empty 00..0...1...00 -> 010.000000.0 + * prot-none, clean, old 00..1...1...00 -> 111.000000.1 + * prot-none, clean, young 01..1...1...00 -> 111.000001.1 + * prot-none, dirty, old 10..1...1...00 -> 111.000010.1 + * prot-none, dirty, young 11..1...1...00 -> 111.000011.1 + * read-only, clean, old 00..1...1...01 -> 111.000100.1 + * read-only, clean, young 01..1...0...01 -> 101.000101.1 + * read-only, dirty, old 10..1...1...01 -> 111.000110.1 + * read-only, dirty, young 11..1...0...01 -> 101.000111.1 + * read-write, clean, old 00..1...1...11 -> 111.001100.1 + * read-write, clean, young 01..1...0...11 -> 101.001101.1 + * read-write, dirty, old 10..0...1...11 -> 110.001110.1 + * read-write, dirty, young 11..0...0...11 -> 100.001111.1 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (present) { + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); +#endif + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else + pteval = _PAGE_INVALID; + return __pte(pteval); +} + +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct page *page; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + page = pud_page(__pud(rste)); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + page = pmd_page(__pmd(rste)); + size = PMD_SIZE; + paddr = rste & PMD_MASK; + } + + if (!test_and_set_bit(PG_arch_1, &page->flags)) + __storage_key_init_range(paddr, paddr + size - 1); +} + +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + unsigned long rste; + + rste = __pte_to_rste(pte); + if (!MACHINE_HAS_NX) + rste &= ~_SEGMENT_ENTRY_NOEXEC; + + /* Set correct table type for 2G hugepages */ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) + rste |= _SEGMENT_ENTRY_LARGE; + + clear_huge_pte_skeys(mm, rste); + set_pte(ptep, __pte(rste)); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + +pte_t huge_ptep_get(pte_t *ptep) +{ + return __rste_to_pte(pte_val(*ptep)); +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(ptep); + pmd_t *pmdp = (pmd_t *) ptep; + pud_t *pudp = (pud_t *) ptep; + + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY)); + else + pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pte; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp = NULL; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (p4dp) { + pudp = pud_alloc(mm, p4dp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } + } + return (pte_t *) pmdp; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp = NULL; + + pgdp = pgd_offset(mm, addr); + if (pgd_present(*pgdp)) { + p4dp = p4d_offset(pgdp, addr); + if (p4d_present(*p4dp)) { + pudp = pud_offset(p4dp, addr); + if (pud_present(*pudp)) { + if (pud_large(*pudp)) + return (pte_t *) pudp; + pmdp = pmd_offset(pudp, addr); + } + } + } + return (pte_t *) pmdp; +} + +int pmd_huge(pmd_t pmd) +{ + return pmd_large(pmd); +} + +int pud_huge(pud_t pud) +{ + return pud_large(pud); +} + +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; +} + +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + unsigned long addr; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + goto check_asce_limit; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + if (mm->get_unmapped_area == arch_get_unmapped_area) + addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + addr = hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); +} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c new file mode 100644 index 0000000000..8b94d2212d --- /dev/null +++ b/arch/s390/mm/init.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +unsigned long __bootdata_preserved(s390_invalid_asce); + +unsigned long empty_zero_page, zero_page_mask; +EXPORT_SYMBOL(empty_zero_page); +EXPORT_SYMBOL(zero_page_mask); + +static void __init setup_zero_pages(void) +{ + unsigned int order; + struct page *page; + int i; + + /* Latest machines require a mapping granularity of 512KB */ + order = 7; + + /* Limit number of empty zero pages for small memory sizes */ + while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) + order--; + + empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!empty_zero_page) + panic("Out of memory in setup_zero_pages"); + + page = virt_to_page((void *) empty_zero_page); + split_page(page, order); + for (i = 1 << order; i > 0; i--) { + mark_page_reserved(page); + page++; + } + + zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; +} + +/* + * paging_init() sets up the page tables + */ +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + vmem_map_init(); + sparse_init(); + zone_dma_bits = 31; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + free_area_init(max_zone_pfns); +} + +void mark_rodata_ro(void) +{ + unsigned long size = __end_ro_after_init - __start_ro_after_init; + + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); + pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); + debug_checkwx(); +} + +int set_memory_encrypted(unsigned long vaddr, int numpages) +{ + int i; + + /* make specified pages unshared, (swiotlb, dma_free) */ + for (i = 0; i < numpages; ++i) { + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +int set_memory_decrypted(unsigned long vaddr, int numpages) +{ + int i; + /* make specified pages shared (swiotlb, dma_alloca) */ + for (i = 0; i < numpages; ++i) { + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; + } + return 0; +} + +/* are we a protected virtualization guest? */ +bool force_dma_unencrypted(struct device *dev) +{ + return is_prot_virt_guest(); +} + +/* protected virtualization */ +static void pv_init(void) +{ + if (!is_prot_virt_guest()) + return; + + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + + /* make sure bounce buffers are shared */ + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); + swiotlb_update_mem_attributes(); +} + +void __init mem_init(void) +{ + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, mm_cpumask(&init_mm)); + + set_max_mapnr(max_low_pfn); + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + + pv_init(); + kfence_split_mapping(); + /* Setup guest page hinting */ + cmma_init(); + + /* this will put all low memory onto the freelists */ + memblock_free_all(); + setup_zero_pages(); /* Setup zeroed pages. */ + + cmma_init_nodat(); +} + +void free_initmem(void) +{ + set_memory_rwnx((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT); + free_initmem_default(POISON_FREE_INITMEM); +} + +unsigned long memory_block_size_bytes(void) +{ + /* + * Make sure the memory block size is always greater + * or equal than the memory increment size. + */ + return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); +} + +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long size_pages = PFN_DOWN(size); + int rc; + + if (WARN_ON_ONCE(params->altmap)) + return -EINVAL; + + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + return -EINVAL; + + VM_BUG_ON(!mhp_range_allowed(start, size, true)); + rc = vmem_add_mapping(start, size); + if (rc) + return rc; + + rc = __add_pages(nid, start_pfn, size_pages, params); + if (rc) + vmem_remove_mapping(start, size); + return rc; +} + +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + vmem_remove_mapping(start, size); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c new file mode 100644 index 0000000000..c805b3e259 --- /dev/null +++ b/arch/s390/mm/maccess.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Access kernel memory without faulting -- s390 specific implementation. + * + * Copyright IBM Corp. 2009, 2015 + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); + +static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) +{ + unsigned long aligned, offset, count; + char tmp[8]; + + aligned = (unsigned long) dst & ~7UL; + offset = (unsigned long) dst & 7UL; + size = min(8UL - offset, size); + count = size - 1; + asm volatile( + " bras 1,0f\n" + " mvc 0(1,%4),0(%5)\n" + "0: mvc 0(8,%3),0(%0)\n" + " ex %1,0(1)\n" + " lg %1,0(%3)\n" + " lra %0,0(%0)\n" + " sturg %1,%0\n" + : "+&a" (aligned), "+&a" (count), "=m" (tmp) + : "a" (&tmp), "a" (&tmp[offset]), "a" (src) + : "cc", "memory", "1"); + return size; +} + +/* + * s390_kernel_write - write to kernel memory bypassing DAT + * @dst: destination address + * @src: source address + * @size: number of bytes to copy + * + * This function writes to kernel memory bypassing DAT and possible page table + * write protection. It writes to the destination using the sturg instruction. + * Therefore we have a read-modify-write sequence: the function reads eight + * bytes from destination at an eight byte boundary, modifies the bytes + * requested and writes the result back in a loop. + */ +static DEFINE_SPINLOCK(s390_kernel_write_lock); + +notrace void *s390_kernel_write(void *dst, const void *src, size_t size) +{ + void *tmp = dst; + unsigned long flags; + long copied; + + spin_lock_irqsave(&s390_kernel_write_lock, flags); + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; + } + spin_unlock_irqrestore(&s390_kernel_write_lock, flags); + + return dst; +} + +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) +{ + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; + } + return res; +} + +int memcpy_real(void *dest, unsigned long src, size_t count) +{ + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; +} + +/* + * Find CPU that owns swapped prefix page + */ +static int get_swapped_owner(phys_addr_t addr) +{ + phys_addr_t lc; + int cpu; + + for_each_online_cpu(cpu) { + lc = virt_to_phys(lowcore_ptr[cpu]); + if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) + continue; + return cpu; + } + return -1; +} + +/* + * Convert a physical pointer for /dev/mem access + * + * For swapped prefix pages a new buffer is returned that contains a copy of + * the absolute memory. The buffer size is maximum one page large. + */ +void *xlate_dev_mem_ptr(phys_addr_t addr) +{ + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; + unsigned long size; + int this_cpu, cpu; + + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; + } + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); + return bounce; +} + +/* + * Free converted buffer for /dev/mem access (if necessary) + */ +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) +{ + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); +} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c new file mode 100644 index 0000000000..fc9a7dc26c --- /dev/null +++ b/arch/s390/mm/mmap.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * Started by Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long stack_maxrandom_size(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return STACK_RND_MASK << PAGE_SHIFT; +} + +static inline int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + return sysctl_legacy_va_layout; +} + +unsigned long arch_mmap_rnd(void) +{ + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; +} + +static unsigned long mmap_base_legacy(unsigned long rnd) +{ + return TASK_UNMAPPED_BASE + rnd; +} + +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_maxrandom_size() + stack_guard_gap; + unsigned long gap_min, gap_max; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + /* + * Top of mmap area (just below the process stack). + * Leave at least a ~128 MB hole. + */ + gap_min = SZ_128M; + gap_max = (STACK_TOP / 6) * 5; + + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} + +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + if (filp || (flags & MAP_SHARED)) + info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; + else + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; + +check_asce_limit: + return check_asce_limit(mm, addr, len); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + if (filp || (flags & MAP_SHARED)) + info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; + else + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (offset_in_page(addr)) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + if (offset_in_page(addr)) + return addr; + } + +check_asce_limit: + return check_asce_limit(mm, addr, len); +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = mmap_base_legacy(random_factor); + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c new file mode 100644 index 0000000000..79a037f49f --- /dev/null +++ b/arch/s390/mm/page-states.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2008 + * + * Guest page hinting for unused pages. + * + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int cmma_flag = 1; + +static int __init cmma(char *str) +{ + bool enabled; + + if (!kstrtobool(str, &enabled)) + cmma_flag = enabled; + return 1; +} +__setup("cmma=", cmma); + +static inline int cmma_test_essa(void) +{ + unsigned long tmp = 0; + int rc = -EOPNOTSUPP; + + /* test ESSA_GET_STATE */ + asm volatile( + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + "0: la %[rc],0\n" + "1:\n" + EX_TABLE(0b,1b) + : [rc] "+&d" (rc), [tmp] "+&d" (tmp) + : [cmd] "i" (ESSA_GET_STATE)); + return rc; +} + +void __init cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + +static inline void set_page_unused(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_UNUSED)); +} + +static inline void set_page_stable_dat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE)); +} + +static inline void set_page_stable_nodat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = phys_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = phys_to_page(pud_val(*pud)); + for (i = 0; i < 4; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = phys_to_page(p4d_val(*p4d)); + for (i = 0; i < 4; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = phys_to_page(pgd_val(*pgd)); + for (i = 0; i < 4; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct page *page; + unsigned long start, end, ix; + int i; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + page = virt_to_page(&swapper_pg_dir); + for (i = 0; i < 4; i++) + set_bit(PG_arch_1, &page[i].flags); + page = virt_to_page(&invalid_pg_dir); + for (i = 0; i < 4; i++) + set_bit(PG_arch_1, &page[i].flags); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); +} + +void arch_alloc_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c new file mode 100644 index 0000000000..b87e96c64b --- /dev/null +++ b/arch/s390/mm/pageattr.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) +{ + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" + : [addr] "+a" (addr) : [skey] "d" (skey)); + return addr; +} + +void __storage_key_init_range(unsigned long start, unsigned long end) +{ + unsigned long boundary, size; + + while (start < end) { + if (MACHINE_HAS_EDAT1) { + /* set storage keys for a 1MB frame */ + size = 1UL << 20; + boundary = (start + size) & ~(size - 1); + if (boundary <= end) { + do { + start = sske_frame(start, PAGE_DEFAULT_KEY); + } while (start < boundary); + continue; + } + } + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); + start += PAGE_SIZE; + } +} + +#ifdef CONFIG_PROC_FS +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2); + seq_printf(m, "DirectMap1M: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10); + seq_printf(m, "DirectMap2G: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21); +} +#endif /* CONFIG_PROC_FS */ + +static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, + unsigned long dtt) +{ + unsigned long *table, mask; + + mask = 0; + if (MACHINE_HAS_EDAT2) { + switch (dtt) { + case CRDTE_DTT_REGION3: + mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); + break; + case CRDTE_DTT_SEGMENT: + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + break; + case CRDTE_DTT_PAGE: + mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + break; + } + table = (unsigned long *)((unsigned long)old & mask); + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + } else if (MACHINE_HAS_IDTE) { + cspg(old, *old, new); + } else { + csp((unsigned int *)old + 1, *old, new); + } +} + +static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + pte_t *ptep, new; + + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); + do { + new = *ptep; + if (pte_none(new)) + return -EINVAL; + if (flags & SET_MEMORY_RO) + new = pte_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pte_mkwrite_novma(pte_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + } + pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); + ptep++; + addr += PAGE_SIZE; + cond_resched(); + } while (addr < end); + return 0; +} + +static int split_pmd_page(pmd_t *pmdp, unsigned long addr) +{ + unsigned long pte_addr, prot; + pte_t *pt_dir, *ptep; + pmd_t new; + int i, ro, nx; + + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + return -ENOMEM; + pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT; + ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT); + nx = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_NOEXEC); + prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + if (!nx) + prot &= ~_PAGE_NOEXEC; + ptep = pt_dir; + for (i = 0; i < PTRS_PER_PTE; i++) { + set_pte(ptep, __pte(pte_addr | prot)); + pte_addr += PAGE_SIZE; + ptep++; + } + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); + update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); + update_page_count(PG_DIRECT_MAP_1M, -1); + return 0; +} + +static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, + unsigned long flags) +{ + pmd_t new = *pmdp; + + if (flags & SET_MEMORY_RO) + new = pmd_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pmd_mkwrite_novma(pmd_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + } + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); +} + +static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pmd_t *pmdp; + int rc = 0; + + pmdp = pmd_offset(pudp, addr); + do { + if (pmd_none(*pmdp)) + return -EINVAL; + next = pmd_addr_end(addr, end); + if (pmd_large(*pmdp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { + rc = split_pmd_page(pmdp, addr); + if (rc) + return rc; + continue; + } + modify_pmd_page(pmdp, addr, flags); + } else { + rc = walk_pte_level(pmdp, addr, next, flags); + if (rc) + return rc; + } + pmdp++; + addr = next; + cond_resched(); + } while (addr < end); + return rc; +} + +static int split_pud_page(pud_t *pudp, unsigned long addr) +{ + unsigned long pmd_addr, prot; + pmd_t *pm_dir, *pmdp; + pud_t new; + int i, ro, nx; + + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + return -ENOMEM; + pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; + ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT); + nx = !!(pud_val(*pudp) & _REGION_ENTRY_NOEXEC); + prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL); + if (!nx) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + pmdp = pm_dir; + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pmd(pmdp, __pmd(pmd_addr | prot)); + pmd_addr += PMD_SIZE; + pmdp++; + } + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); + update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); + update_page_count(PG_DIRECT_MAP_2G, -1); + return 0; +} + +static void modify_pud_page(pud_t *pudp, unsigned long addr, + unsigned long flags) +{ + pud_t new = *pudp; + + if (flags & SET_MEMORY_RO) + new = pud_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pud_mkwrite(pud_mkdirty(new)); + if (flags & SET_MEMORY_NX) + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + else if (flags & SET_MEMORY_X) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + } + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); +} + +static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int need_split; + pud_t *pudp; + int rc = 0; + + pudp = pud_offset(p4d, addr); + do { + if (pud_none(*pudp)) + return -EINVAL; + next = pud_addr_end(addr, end); + if (pud_large(*pudp)) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { + rc = split_pud_page(pudp, addr); + if (rc) + break; + continue; + } + modify_pud_page(pudp, addr, flags); + } else { + rc = walk_pmd_level(pudp, addr, next, flags); + } + pudp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + p4d_t *p4dp; + int rc = 0; + + p4dp = p4d_offset(pgd, addr); + do { + if (p4d_none(*p4dp)) + return -EINVAL; + next = p4d_addr_end(addr, end); + rc = walk_pud_level(p4dp, addr, next, flags); + p4dp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +DEFINE_MUTEX(cpa_mutex); + +static int change_page_attr(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int rc = -EINVAL; + pgd_t *pgdp; + + pgdp = pgd_offset_k(addr); + do { + if (pgd_none(*pgdp)) + break; + next = pgd_addr_end(addr, end); + rc = walk_p4d_level(pgdp, addr, next, flags); + if (rc) + break; + cond_resched(); + } while (pgdp++, addr = next, addr < end && !rc); + return rc; +} + +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. + */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + + if (!MACHINE_HAS_NX) + flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); + if (!flags) + return 0; + if (!numpages) + return 0; + addr &= PAGE_MASK; + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) + +static void ipte_range(pte_t *pte, unsigned long address, int nr) +{ + int i; + + if (test_facility(13)) { + __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL); + return; + } + for (i = 0; i < nr; i++) { + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); + address += PAGE_SIZE; + pte++; + } +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long address; + pte_t *ptep, pte; + int nr, i, j; + + for (i = 0; i < numpages;) { + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); + nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); + nr = min(numpages - i, nr); + if (enable) { + for (j = 0; j < nr; j++) { + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + address += PAGE_SIZE; + ptep++; + } + } else { + ipte_range(ptep, address, nr); + } + i += nr; + } +} + +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 0000000000..1aac13bb8f --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. + */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 0000000000..6396d6b06a --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Page table allocation functions + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PGSTE + +int page_table_allocate_pgste = 0; +EXPORT_SYMBOL(page_table_allocate_pgste); + +static struct ctl_table page_table_sysctl[] = { + { + .procname = "allocate_pgste", + .data = &page_table_allocate_pgste, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init page_table_register_sysctl(void) +{ + return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM; +} +__initcall(page_table_register_sysctl); + +#endif /* CONFIG_PGSTE */ + +unsigned long *crst_table_alloc(struct mm_struct *mm) +{ + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + + if (!ptdesc) + return NULL; + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + pagetable_free(virt_to_ptdesc(table)); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + S390_lowcore.user_asce = mm->context.asce; + __ctl_load(S390_lowcore.user_asce, 7, 7); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long end) +{ + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; + + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ + VM_BUG_ON(asce_limit < _REGION2_SIZE); + + if (end <= asce_limit) + return 0; + + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } + + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_lock lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. + */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + + spin_unlock_bh(&mm->page_table_lock); + + on_each_cpu(__crst_table_upgrade, mm, 0); + + return 0; + +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; +} + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ + return atomic_fetch_xor(bits, v) ^ bits; +} + +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct ptdesc *ptdesc; + u64 *table; + + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); + arch_set_page_dat(virt_to_page(table), 0); + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } + return ptdesc_page(ptdesc); +} + +void page_table_free_pgste(struct page *page) +{ + pagetable_free(page_ptdesc(page)); +} + +#endif /* CONFIG_PGSTE */ + +/* + * A 2KB-pgtable is either upper or lower half of a normal page. + * The second half of the page may be unused or used as another + * 2KB-pgtable. + * + * Whenever possible the parent page for a new 2KB-pgtable is picked + * from the list of partially allocated pages mm_context_t::pgtable_list. + * In case the list is empty a new parent page is allocated and added to + * the list. + * + * When a parent page gets fully allocated it contains 2KB-pgtables in both + * upper and lower halves and is removed from mm_context_t::pgtable_list. + * + * When 2KB-pgtable is freed from to fully allocated parent page that + * page turns partially allocated and added to mm_context_t::pgtable_list. + * + * If 2KB-pgtable is freed from the partially allocated parent page that + * page turns unused and gets removed from mm_context_t::pgtable_list. + * Furthermore, the unused parent page is released. + * + * As follows from the above, no unallocated or fully allocated parent + * pages are contained in mm_context_t::pgtable_list. + * + * The upper byte (bits 24-31) of the parent page _refcount is used + * for tracking contained 2KB-pgtables and has the following format: + * + * PP AA + * 01234567 upper byte (bits 24-31) of struct page::_refcount + * || || + * || |+--- upper 2KB-pgtable is allocated + * || +---- lower 2KB-pgtable is allocated + * |+------- upper 2KB-pgtable is pending for removal + * +-------- lower 2KB-pgtable is pending for removal + * + * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why + * using _refcount is possible). + * + * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still unallocated; + * - removed from mm_context_t::pgtable_list in case both hales of the + * parent page are allocated; + * These operations are protected with mm_context_t::lock. + * + * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 + * and the corresponding PP bit is set to 1 in a single atomic operation. + * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually + * exclusive and may never be both set to 1! + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still allocated; + * - removed from mm_context_t::pgtable_list in case the second half of + * the parent page is unallocated; + * These operations are protected with mm_context_t::lock. + * + * It is important to understand that mm_context_t::lock only protects + * mm_context_t::pgtable_list and AA bits, but not the parent page itself + * and PP bits. + * + * Releasing the parent page happens whenever the PP bit turns from 1 to 0, + * while both AA bits and the second PP bit are already unset. Then the + * parent page does not contain any 2KB-pgtable fragment anymore, and it has + * also been removed from mm_context_t::pgtable_list. It is safe to release + * the page therefore. + * + * PGSTE memory spaces use full 4KB-pgtables and do not need most of the + * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable + * while the PP bits are never used, nor such a page is added to or removed + * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. + */ +unsigned long *page_table_alloc(struct mm_struct *mm) +{ + unsigned long *table; + struct ptdesc *ptdesc; + unsigned int mask, bit; + + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.lock); + if (!list_empty(&mm->context.pgtable_list)) { + ptdesc = list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; + /* + * The pending removal bits must also be checked. + * Failure to do so might lead to an impossible + * value of (i.e 0x13 or 0x23) written to _refcount. + * Such values violate the assumption that pending and + * allocation bits are mutually exclusive, and the rest + * of the code unrails as result. That could lead to + * a whole bunch of races and corruptions. + */ + mask = (mask | (mask >> 4)) & 0x03U; + if (mask != 0x03U) { + table = (unsigned long *) ptdesc_to_virt(ptdesc); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&ptdesc->_refcount, + 0x01U << (bit + 24)); + list_del_init(&ptdesc->pt_list); + } + } + spin_unlock_bh(&mm->context.lock); + if (table) + return table; + } + /* Allocate a fresh page */ + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + arch_set_page_dat(ptdesc_page(ptdesc), 0); + /* Initialize page table */ + table = (unsigned long *) ptdesc_to_virt(ptdesc); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } else { + /* Return the first 2K fragment of the page */ + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); + memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); + spin_lock_bh(&mm->context.lock); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); + spin_unlock_bh(&mm->context.lock); + } + return table; +} + +static void page_table_release_check(struct page *page, void *table, + unsigned int half, unsigned int mask) +{ + char msg[128]; + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) + return; + snprintf(msg, sizeof(msg), + "Invalid pgtable %p release half 0x%02x mask 0x%02x", + table, half, mask); + dump_page(page, msg); +} + +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + unsigned int mask, bit, half; + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.lock); + /* + * Mark the page for delayed release. The actual release + * will happen outside of the critical section from this + * function or from __tlb_remove_table() + */ + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); + mask >>= 24; + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); + } else { + /* If page is on list, now remove it. */ + list_del_init(&ptdesc->pt_list); + } + spin_unlock_bh(&mm->context.lock); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); + mask >>= 24; + if (mask != 0x00U) + return; + half = 0x01U << bit; + } else { + half = 0x03U; + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); + mask >>= 24; + } + + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + else + pte_free_now(&ptdesc->pt_rcu_head); +} + +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + mm = tlb->mm; + if (mm_alloc_pgste(mm)) { + gmap_unlink(mm, table, vmaddr); + table = (unsigned long *) ((unsigned long)table | 0x03U); + tlb_remove_ptdesc(tlb, table); + return; + } + bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.lock); + /* + * Mark the page for delayed release. The actual release will happen + * outside of the critical section from __tlb_remove_table() or from + * page_table_free() + */ + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); + mask >>= 24; + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); + } else { + /* If page is on list, now remove it. */ + list_del_init(&ptdesc->pt_list); + } + spin_unlock_bh(&mm->context.lock); + table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); + tlb_remove_table(tlb, table); +} + +void __tlb_remove_table(void *_table) +{ + unsigned int mask = (unsigned long) _table & 0x03U, half = mask; + void *table = (void *)((unsigned long) _table ^ mask); + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + switch (half) { + case 0x00U: /* pmd, pud, or p4d */ + pagetable_free(ptdesc); + return; + case 0x01U: /* lower 2K of a 4K page table */ + case 0x02U: /* higher 2K of a 4K page table */ + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); + mask >>= 24; + if (mask != 0x00U) + return; + break; + case 0x03U: /* 4K page table with pgstes */ + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); + mask >>= 24; + break; + } + + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + else + pte_free_now(&ptdesc->pt_rcu_head); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Base infrastructure required to generate basic asces, region, segment, + * and page tables that do not make use of enhanced features like EDAT1. + */ + +static struct kmem_cache *base_pgt_cache; + +static unsigned long *base_pgt_alloc(void) +{ + unsigned long *table; + + table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); + if (table) + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; +} + +static void base_pgt_free(unsigned long *table) +{ + kmem_cache_free(base_pgt_cache, table); +} + +static unsigned long *base_crst_alloc(unsigned long val) +{ + unsigned long *table; + struct ptdesc *ptdesc; + + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); + return table; +} + +static void base_crst_free(unsigned long *table) +{ + pagetable_free(virt_to_ptdesc(table)); +} + +#define BASE_ADDR_END_FUNC(NAME, SIZE) \ +static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ + unsigned long end) \ +{ \ + unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \ + \ + return (next - 1) < (end - 1) ? next : end; \ +} + +BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) +BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) +BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) +BASE_ADDR_END_FUNC(region1, _REGION1_SIZE) + +static inline unsigned long base_lra(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %0,0(%1)\n" + : "=d" (real) : "a" (address) : "cc"); + return real; +} + +static int base_page_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *pte, next; + + if (!alloc) + return 0; + pte = origin; + pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + do { + next = base_page_addr_end(addr, end); + *pte = base_lra(addr); + } while (pte++, addr = next, addr < end); + return 0; +} + +static int base_segment_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *ste, next, *table; + int rc; + + ste = origin; + ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + do { + next = base_segment_addr_end(addr, end); + if (*ste & _SEGMENT_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_pgt_alloc(); + if (!table) + return -ENOMEM; + *ste = __pa(table) | _SEGMENT_ENTRY; + } + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); + rc = base_page_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_pgt_free(table); + cond_resched(); + } while (ste++, addr = next, addr < end); + return 0; +} + +static int base_region3_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rtte, next, *table; + int rc; + + rtte = origin; + rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; + do { + next = base_region3_addr_end(addr, end); + if (*rtte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rtte = __pa(table) | _REGION3_ENTRY; + } + table = __va(*rtte & _REGION_ENTRY_ORIGIN); + rc = base_segment_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rtte++, addr = next, addr < end); + return 0; +} + +static int base_region2_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rste, next, *table; + int rc; + + rste = origin; + rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; + do { + next = base_region2_addr_end(addr, end); + if (*rste & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rste = __pa(table) | _REGION2_ENTRY; + } + table = __va(*rste & _REGION_ENTRY_ORIGIN); + rc = base_region3_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rste++, addr = next, addr < end); + return 0; +} + +static int base_region1_walk(unsigned long *origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rfte, next, *table; + int rc; + + rfte = origin; + rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; + do { + next = base_region1_addr_end(addr, end); + if (*rfte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rfte = __pa(table) | _REGION1_ENTRY; + } + table = __va(*rfte & _REGION_ENTRY_ORIGIN); + rc = base_region2_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rfte++, addr = next, addr < end); + return 0; +} + +/** + * base_asce_free - free asce and tables returned from base_asce_alloc() + * @asce: asce to be freed + * + * Frees all region, segment, and page tables that were allocated with a + * corresponding base_asce_alloc() call. + */ +void base_asce_free(unsigned long asce) +{ + unsigned long *table = __va(asce & _ASCE_ORIGIN); + + if (!asce) + return; + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_SEGMENT: + base_segment_walk(table, 0, _REGION3_SIZE, 0); + break; + case _ASCE_TYPE_REGION3: + base_region3_walk(table, 0, _REGION2_SIZE, 0); + break; + case _ASCE_TYPE_REGION2: + base_region2_walk(table, 0, _REGION1_SIZE, 0); + break; + case _ASCE_TYPE_REGION1: + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); + break; + } + base_crst_free(table); +} + +static int base_pgt_cache_init(void) +{ + static DEFINE_MUTEX(base_pgt_cache_mutex); + unsigned long sz = _PAGE_TABLE_SIZE; + + if (base_pgt_cache) + return 0; + mutex_lock(&base_pgt_cache_mutex); + if (!base_pgt_cache) + base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL); + mutex_unlock(&base_pgt_cache_mutex); + return base_pgt_cache ? 0 : -ENOMEM; +} + +/** + * base_asce_alloc - create kernel mapping without enhanced DAT features + * @addr: virtual start address of kernel mapping + * @num_pages: number of consecutive pages + * + * Generate an asce, including all required region, segment and page tables, + * that can be used to access the virtual kernel mapping. The difference is + * that the returned asce does not make use of any enhanced DAT features like + * e.g. large pages. This is required for some I/O functions that pass an + * asce, like e.g. some service call requests. + * + * Note: the returned asce may NEVER be attached to any cpu. It may only be + * used for I/O requests. tlb entries that might result because the + * asce was attached to a cpu won't be cleared. + */ +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) +{ + unsigned long asce, *table, end; + int rc; + + if (base_pgt_cache_init()) + return 0; + end = addr + num_pages * PAGE_SIZE; + if (end <= _REGION3_SIZE) { + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_segment_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION2_SIZE) { + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region3_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION1_SIZE) { + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region2_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + table = base_crst_alloc(_REGION1_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region1_walk(table, addr, end, 1); + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + } + if (rc) { + base_asce_free(asce); + asce = 0; + } + return asce; +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c new file mode 100644 index 0000000000..5cb9294154 --- /dev/null +++ b/arch/s390/mm/pgtable.c @@ -0,0 +1,1207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2011 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + } +} + +static inline pte_t ptep_flush_direct(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + ptep_ipte_local(mm, addr, ptep, nodat); + else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pte_t ptep_flush_lazy(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); + mm->context.flush_mm = 1; + } else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long new = 0; +#ifdef CONFIG_PGSTE + unsigned long old; + + asm( + " lg %0,%2\n" + "0: lgr %1,%0\n" + " nihh %0,0xff7f\n" /* clear PCL bit in old */ + " oihh %1,0x0080\n" /* set PCL bit in new */ + " csg %0,%1,%2\n" + " jl 0b\n" + : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) + : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); +#endif + return __pgste(new); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + asm( + " nihh %1,0xff7f\n" /* clear PCL bit */ + " stg %1,%0\n" + : "=Q" (ptep[PTRS_PER_PTE]) + : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) + : "cc", "memory"); +#endif +} + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); +#endif + return __pgste(pgste); +} + +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} + +static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, + struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + unsigned long address, bits, skey; + + if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) + return pgste; + address = pte_val(pte) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + /* Transfer page changed & referenced bit to guest bits in pgste */ + pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ + /* Copy page access key and fetch protection bit to pgste */ + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; +#endif + return pgste; + +} + +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, + struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + unsigned long address; + unsigned long nkey; + + if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) + return; + VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); + address = pte_val(entry) & PAGE_MASK; + /* + * Set page access key and fetch protection bit from pgste. + * The guest C/R information is still in the PGSTE, set real + * key C/R to 0. + */ + nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + page_set_storage_key(address, nkey, 0); +#endif +} + +static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) +{ +#ifdef CONFIG_PGSTE + if ((pte_val(entry) & _PAGE_PRESENT) && + (pte_val(entry) & _PAGE_WRITE) && + !(pte_val(entry) & _PAGE_INVALID)) { + if (!MACHINE_HAS_ESOP) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. + */ + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + if (!(pte_val(entry) & _PAGE_PROTECT)) + /* This pte allows write access, set user-dirty */ + pgste_val(pgste) |= PGSTE_UC_BIT; + } +#endif + set_pte(ptep, entry); + return pgste; +} + +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); + } +#endif + return pgste; +} + +static inline pgste_t ptep_xchg_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pgste_t pgste = __pgste(0); + + if (mm_has_pgste(mm)) { + pgste = pgste_get_lock(ptep); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + } + return pgste; +} + +static inline pte_t ptep_xchg_commit(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pgste_t pgste, pte_t old, pte_t new) +{ + if (mm_has_pgste(mm)) { + if (pte_val(old) & _PAGE_INVALID) + pgste_set_key(ptep, pgste, new, mm); + if (pte_val(new) & _PAGE_INVALID) { + pgste = pgste_update_all(old, pgste, mm); + if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == + _PGSTE_GPS_USAGE_UNUSED) + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); + } + pgste = pgste_set_pte(ptep, pgste, new); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, new); + } + return old; +} + +pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) +{ + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_direct); + +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + +pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) +{ + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_lazy); + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t old; + int nodat; + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + if (mm_has_pgste(mm)) { + pgste = pgste_update_all(old, pgste, mm); + pgste_set(ptep, pgste); + } + return old; +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; + + if (!MACHINE_HAS_NX) + pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); + if (mm_has_pgste(mm)) { + pgste = pgste_get(ptep); + pgste_set_key(ptep, pgste, pte, mm); + pgste = pgste_set_pte(ptep, pgste, pte); + pgste_set_unlock(ptep, pgste); + } else { + set_pte(ptep, pte); + } + preempt_enable(); +} + +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_local(mm, addr); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) { + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else if (MACHINE_HAS_IDTE) { + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else { + __pmdp_csp(pmdp); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_csp(mm, addr); + } +} + +static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pmdp_idte_local(mm, addr, pmdp); + else + pmdp_idte_global(mm, addr, pmdp); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); + mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); + } else { + pmdp_idte_global(mm, addr, pmdp); + } + atomic_dec(&mm->context.flush_count); + return old; +} + +#ifdef CONFIG_PGSTE +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) + return -EFAULT; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. */ + if (pud_large(*pud)) + return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; +} +#endif + +pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) +{ + pmd_t old; + + preempt_disable(); + old = pmdp_flush_direct(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pmdp_xchg_direct); + +pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) +{ + pmd_t old; + + preempt_disable(); + old = pmdp_flush_lazy(mm, addr, pmdp); + set_pmd(pmdp, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pmdp_xchg_lazy); + +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); +} + +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); + else + /* + * Invalid bit position is the same for pmd and pud, so we can + * re-use _pmd_csp() here + */ + __pmdp_csp((pmd_t *) pudp); +} + +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pudp_idte_local(mm, addr, pudp); + else + pudp_idte_global(mm, addr, pudp); + atomic_dec(&mm->context.flush_count); + return old; +} + +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t new) +{ + pud_t old; + + preempt_disable(); + old = pudp_flush_direct(mm, addr, pudp); + set_pud(pudp, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pudp_xchg_direct); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + struct list_head *lh; + pgtable_t pgtable; + pte_t *ptep; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + set_pte(ptep, __pte(_PAGE_INVALID)); + ptep++; + set_pte(ptep, __pte(_PAGE_INVALID)); + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) +{ + pgste_t pgste; + + /* the mm_has_pgste() check is done in set_pte_at() */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pgste_t pgste; + + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) |= PGSTE_IN_BIT; + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p, nodat; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep, nodat); + pgste = pgste_update_all(entry, pgste, mm); + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep, nodat); + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + int nodat; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + + dec_mm_counter(mm, mm_counter(page)); + } + free_swap_and_cache(entry); +} + +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) +{ + unsigned long pgstev; + pgste_t pgste; + pte_t pte; + + /* Zap unused and logically-zero pages */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (!reset && pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + unsigned long ptev; + pgste_t pgste; + + /* Clear storage key ACC and F, but set R/C */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +/* + * Test and reset if a guest page is dirty + */ +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t pte; + bool dirty; + int nodat; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); + else + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); + } + pgste_set_unlock(ptep, pgste); + return dirty; +} +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); + +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) +{ + unsigned long keyul, paddr; + spinlock_t *ptl; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return key ? -EFAULT : 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. + */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; + pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long bits, skey; + + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(paddr, skey, !nq); + /* Merge host changed & referenced into pgste */ + pgste_val(new) |= bits << 52; + } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + +/* + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) +{ + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; + + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; + } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; +} +EXPORT_SYMBOL(cond_set_guest_storage_key); + +/* + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. + */ +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + unsigned long paddr; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + int cc = 0; + + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + pgste_val(new) &= ~PGSTE_GR_BIT; + + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + paddr = pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); + /* Merge real referenced bit into host-set */ + pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return cc; +} +EXPORT_SYMBOL(reset_guest_reference_bit); + +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) +{ + unsigned long paddr; + spinlock_t *ptl; + pgste_t pgste; + pmd_t *pmdp; + pte_t *ptep; + + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. + */ + *key = 0; + + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: + return -EFAULT; + } +again: + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; + pgste = pgste_get_lock(ptep); + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(paddr); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(get_guest_storage_key); + +/** + * pgste_perform_essa - perform ESSA actions on the PGSTE. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @orc: the specific action to perform, see the ESSA_SET_* macros. + * @oldpte: the PTE will be saved there if the pointer is not NULL. + * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. + * + * Return: 1 if the page is to be added to the CBRL, otherwise 0, + * or < 0 in case of error. -EINVAL is returned for invalid values + * of orc, -EFAULT for invalid addresses. + */ +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) +{ + struct vm_area_struct *vma; + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + int res = 0; + + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + if (oldpte) + *oldpte = pte_val(*ptep); + if (oldpgste) + *oldpgste = pgstev; + + switch (orc) { + case ESSA_GET_STATE: + break; + case ESSA_SET_STABLE: + pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgstev |= _PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_UNUSED: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_UNUSED; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; + break; + } + if (pgstev & _PGSTE_GPS_ZERO) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + break; + } + if (!(pgstev & PGSTE_GC_BIT)) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + res = 1; + break; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + } + break; + case ESSA_SET_STABLE_NODAT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; + break; + default: + /* we should never get here! */ + break; + } + /* If we are discarding a page, set it to logical zero */ + if (res) + pgstev |= _PGSTE_GPS_ZERO; + + pgste_val(pgste) = pgstev; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return res; +} +EXPORT_SYMBOL(pgste_perform_essa); + +/** + * set_pgste_bits - set specific PGSTE bits. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @bits: a bitmask representing the bits that will be touched + * @value: the values of the bits to be written. Only the bits in the mask + * will be written. + * + * Return: 0 on success, < 0 in case of error. + */ +int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + new = pgste_get_lock(ptep); + + pgste_val(new) &= ~bits; + pgste_val(new) |= value & bits; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(set_pgste_bits); + +/** + * get_pgste - get the current PGSTE for the given address. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @pgstep: will be written with the current PGSTE for the given address. + * + * Return: 0 on success, < 0 in case of error. + */ +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + *pgstep = pgste_val(pgste_get(ptep)); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(get_pgste); +#endif diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c new file mode 100644 index 0000000000..6d276103c6 --- /dev/null +++ b/arch/s390/mm/vmem.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(vmem_mutex); + +static void __ref *vmem_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + if (slab_is_available()) + return (void *)__get_free_pages(GFP_KERNEL, order); + return memblock_alloc(size, size); +} + +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) + return; + free_pages(addr, order); +} + +void *vmem_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (!table) + return NULL; + crst_table_init(table, val); + if (slab_is_available()) + arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER); + return table; +} + +pte_t __ref *vmem_pte_alloc(void) +{ + unsigned long size = PTRS_PER_PTE * sizeof(pte_t); + pte_t *pte; + + if (slab_is_available()) + pte = (pte_t *) page_table_alloc(&init_mm); + else + pte = (pte_t *) memblock_alloc(size, size); + if (!pte) + return NULL; + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + +/* + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. + */ +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} + +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; + } + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. */ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long prot, pages = 0; + int ret = -ENOMEM; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { + continue; + } + pages++; + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); + return ret; +} + +static void try_free_pte_table(pmd_t *pmd, unsigned long start) +{ + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; + } + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; + + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; + + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; + } + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} + +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + int i; + + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} + +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; + continue; + } + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + ret = modify_pmd_table(pud, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + return ret; +} + +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + pud_t *pud; + int i; + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} + +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + ret = modify_pud_table(p4d, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); + } + ret = 0; +out: + return ret; +} + +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); +} + +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) +{ + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > VMALLOC_START)) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); + } + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} + +static int add_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, true, direct); +} + +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, false, direct); +} + +/* + * Add a physical memory range to the 1:1 mapping. + */ +static int vmem_add_range(unsigned long start, unsigned long size) +{ + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true); +} + +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) +{ + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true); +} + +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + int ret; + + mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); + return ret; +} + +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); +} + +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); + mutex_unlock(&vmem_mutex); +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = max_mappable - 1; + return mhp_range; +} + +int vmem_add_mapping(unsigned long start, unsigned long size) +{ + struct range range = arch_get_mappable_range(); + int ret; + + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; + + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); + if (ret) + vmem_remove_range(start, size); + mutex_unlock(&vmem_mutex); + return ret; +} + +/* + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. + */ +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + +void __init vmem_map_init(void) +{ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(_sinittext, _einittext); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ + if (!static_key_enabled(&cpu_has_bear)) + set_memory_x(0, 1); + if (debug_pagealloc_enabled()) { + /* + * Use RELOC_HIDE() as long as __va(0) translates to NULL, + * since performing pointer arithmetic on a NULL pointer + * has undefined behavior and generates compiler warnings. + */ + __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size)); + } + if (MACHINE_HAS_NX) + ctl_set_bit(0, 20); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile new file mode 100644 index 0000000000..8cab6deb04 --- /dev/null +++ b/arch/s390/net/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Arch-specific network modules +# +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h new file mode 100644 index 0000000000..7822ea92e5 --- /dev/null +++ b/arch/s390/net/bpf_jit.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF Jit compiler defines + * + * Copyright IBM Corp. 2012,2015 + * + * Author(s): Martin Schwidefsky + * Michael Holzheu + */ + +#ifndef __ARCH_S390_NET_BPF_JIT_H +#define __ARCH_S390_NET_BPF_JIT_H + +#ifndef __ASSEMBLY__ + +#include +#include + +#endif /* __ASSEMBLY__ */ + +/* + * Stackframe layout (packed stack): + * + * ^ high + * +---------------+ | + * | old backchain | | + * +---------------+ | + * | r15 - r6 | | + * +---------------+ | + * | 4 byte align | | + * | tail_call_cnt | | + * BFP -> +===============+ | + * | | | + * | BPF stack | | + * | | | + * R15+160 -> +---------------+ | + * | new backchain | | + * R15+152 -> +---------------+ | + * | + 152 byte SA | | + * R15 -> +---------------+ + low + * + * We get 160 bytes stack space from calling function, but only use + * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt. + * + * The stack size used by the BPF program ("BPF stack" above) is passed + * via "aux->stack_depth". + */ +#define STK_SPACE_ADD (160) +#define STK_160_UNUSED (160 - 12 * 8) +#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED) + +#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ +#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ + +#endif /* __ARCH_S390_NET_BPF_JIT_H */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c new file mode 100644 index 0000000000..e507692e51 --- /dev/null +++ b/arch/s390/net/bpf_jit_comp.c @@ -0,0 +1,2537 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF Jit compiler for s390. + * + * Minimum build requirements: + * + * - HAVE_MARCH_Z196_FEATURES: laal, laalg + * - HAVE_MARCH_Z10_FEATURES: msfi, cgrj, clgrj + * - HAVE_MARCH_Z9_109_FEATURES: alfi, llilf, clfi, oilf, nilf + * - 64BIT + * + * Copyright IBM Corp. 2012,2015 + * + * Author(s): Martin Schwidefsky + * Michael Holzheu + */ + +#define KMSG_COMPONENT "bpf_jit" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_jit.h" + +struct bpf_jit { + u32 seen; /* Flags to remember seen eBPF instructions */ + u32 seen_reg[16]; /* Array to remember which registers are used */ + u32 *addrs; /* Array with relative instruction addresses */ + u8 *prg_buf; /* Start of program */ + int size; /* Size of program and literal pool */ + int size_prg; /* Size of program */ + int prg; /* Current position in program */ + int lit32_start; /* Start of 32-bit literal pool */ + int lit32; /* Current position in 32-bit literal pool */ + int lit64_start; /* Start of 64-bit literal pool */ + int lit64; /* Current position in 64-bit literal pool */ + int base_ip; /* Base address for literal pool */ + int exit_ip; /* Address of exit */ + int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ + int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ + int tail_call_start; /* Tail call start offset */ + int excnt; /* Number of exception table entries */ + int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ + int prologue_plt; /* Start of prologue hotpatch PLT */ +}; + +#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ +#define SEEN_LITERAL BIT(1) /* code uses literals */ +#define SEEN_FUNC BIT(2) /* calls C functions */ +#define SEEN_STACK (SEEN_FUNC | SEEN_MEM) + +/* + * s390 registers + */ +#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */ +#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */ +#define REG_L (MAX_BPF_JIT_REG + 2) /* Literal pool register */ +#define REG_15 (MAX_BPF_JIT_REG + 3) /* Register 15 */ +#define REG_0 REG_W0 /* Register 0 */ +#define REG_1 REG_W1 /* Register 1 */ +#define REG_2 BPF_REG_1 /* Register 2 */ +#define REG_3 BPF_REG_2 /* Register 3 */ +#define REG_4 BPF_REG_3 /* Register 4 */ +#define REG_7 BPF_REG_6 /* Register 7 */ +#define REG_8 BPF_REG_7 /* Register 8 */ +#define REG_14 BPF_REG_0 /* Register 14 */ + +/* + * Mapping of BPF registers to s390 registers + */ +static const int reg2hex[] = { + /* Return code */ + [BPF_REG_0] = 14, + /* Function parameters */ + [BPF_REG_1] = 2, + [BPF_REG_2] = 3, + [BPF_REG_3] = 4, + [BPF_REG_4] = 5, + [BPF_REG_5] = 6, + /* Call saved registers */ + [BPF_REG_6] = 7, + [BPF_REG_7] = 8, + [BPF_REG_8] = 9, + [BPF_REG_9] = 10, + /* BPF stack pointer */ + [BPF_REG_FP] = 13, + /* Register for blinding */ + [BPF_REG_AX] = 12, + /* Work registers for s390x backend */ + [REG_W0] = 0, + [REG_W1] = 1, + [REG_L] = 11, + [REG_15] = 15, +}; + +static inline u32 reg(u32 dst_reg, u32 src_reg) +{ + return reg2hex[dst_reg] << 4 | reg2hex[src_reg]; +} + +static inline u32 reg_high(u32 reg) +{ + return reg2hex[reg] << 4; +} + +static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) +{ + u32 r1 = reg2hex[b1]; + + if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1]) + jit->seen_reg[r1] = 1; +} + +#define REG_SET_SEEN(b1) \ +({ \ + reg_set_seen(jit, b1); \ +}) + +#define REG_SEEN(b1) jit->seen_reg[reg2hex[(b1)]] + +/* + * EMIT macros for code generation + */ + +#define _EMIT2(op) \ +({ \ + if (jit->prg_buf) \ + *(u16 *) (jit->prg_buf + jit->prg) = (op); \ + jit->prg += 2; \ +}) + +#define EMIT2(op, b1, b2) \ +({ \ + _EMIT2((op) | reg(b1, b2)); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ +}) + +#define _EMIT4(op) \ +({ \ + if (jit->prg_buf) \ + *(u32 *) (jit->prg_buf + jit->prg) = (op); \ + jit->prg += 4; \ +}) + +#define EMIT4(op, b1, b2) \ +({ \ + _EMIT4((op) | reg(b1, b2)); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ +}) + +#define EMIT4_RRF(op, b1, b2, b3) \ +({ \ + _EMIT4((op) | reg_high(b3) << 8 | reg(b1, b2)); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ + REG_SET_SEEN(b3); \ +}) + +#define _EMIT4_DISP(op, disp) \ +({ \ + unsigned int __disp = (disp) & 0xfff; \ + _EMIT4((op) | __disp); \ +}) + +#define EMIT4_DISP(op, b1, b2, disp) \ +({ \ + _EMIT4_DISP((op) | reg_high(b1) << 16 | \ + reg_high(b2) << 8, (disp)); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ +}) + +#define EMIT4_IMM(op, b1, imm) \ +({ \ + unsigned int __imm = (imm) & 0xffff; \ + _EMIT4((op) | reg_high(b1) << 16 | __imm); \ + REG_SET_SEEN(b1); \ +}) + +#define EMIT4_PCREL(op, pcrel) \ +({ \ + long __pcrel = ((pcrel) >> 1) & 0xffff; \ + _EMIT4((op) | __pcrel); \ +}) + +#define EMIT4_PCREL_RIC(op, mask, target) \ +({ \ + int __rel = ((target) - jit->prg) / 2; \ + _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ +}) + +#define _EMIT6(op1, op2) \ +({ \ + if (jit->prg_buf) { \ + *(u32 *) (jit->prg_buf + jit->prg) = (op1); \ + *(u16 *) (jit->prg_buf + jit->prg + 4) = (op2); \ + } \ + jit->prg += 6; \ +}) + +#define _EMIT6_DISP(op1, op2, disp) \ +({ \ + unsigned int __disp = (disp) & 0xfff; \ + _EMIT6((op1) | __disp, op2); \ +}) + +#define _EMIT6_DISP_LH(op1, op2, disp) \ +({ \ + u32 _disp = (u32) (disp); \ + unsigned int __disp_h = _disp & 0xff000; \ + unsigned int __disp_l = _disp & 0x00fff; \ + _EMIT6((op1) | __disp_l, (op2) | __disp_h >> 4); \ +}) + +#define EMIT6_DISP_LH(op1, op2, b1, b2, b3, disp) \ +({ \ + _EMIT6_DISP_LH((op1) | reg(b1, b2) << 16 | \ + reg_high(b3) << 8, op2, disp); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ + REG_SET_SEEN(b3); \ +}) + +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ +({ \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ + (op2) | (mask) << 12); \ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ +}) + +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ +({ \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ + (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ + REG_SET_SEEN(b1); \ + BUILD_BUG_ON(((unsigned long) (imm)) > 0xff); \ +}) + +#define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ +({ \ + int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ + _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ + REG_SET_SEEN(b1); \ + REG_SET_SEEN(b2); \ +}) + +#define EMIT6_PCREL_RILB(op, b, target) \ +({ \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ + REG_SET_SEEN(b); \ +}) + +#define EMIT6_PCREL_RIL(op, target) \ +({ \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ + _EMIT6((op) | rel >> 16, rel & 0xffff); \ +}) + +#define EMIT6_PCREL_RILC(op, mask, target) \ +({ \ + EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ +}) + +#define _EMIT6_IMM(op, imm) \ +({ \ + unsigned int __imm = (imm); \ + _EMIT6((op) | (__imm >> 16), __imm & 0xffff); \ +}) + +#define EMIT6_IMM(op, b1, imm) \ +({ \ + _EMIT6_IMM((op) | reg_high(b1) << 16, imm); \ + REG_SET_SEEN(b1); \ +}) + +#define _EMIT_CONST_U32(val) \ +({ \ + unsigned int ret; \ + ret = jit->lit32; \ + if (jit->prg_buf) \ + *(u32 *)(jit->prg_buf + jit->lit32) = (u32)(val);\ + jit->lit32 += 4; \ + ret; \ +}) + +#define EMIT_CONST_U32(val) \ +({ \ + jit->seen |= SEEN_LITERAL; \ + _EMIT_CONST_U32(val) - jit->base_ip; \ +}) + +#define _EMIT_CONST_U64(val) \ +({ \ + unsigned int ret; \ + ret = jit->lit64; \ + if (jit->prg_buf) \ + *(u64 *)(jit->prg_buf + jit->lit64) = (u64)(val);\ + jit->lit64 += 8; \ + ret; \ +}) + +#define EMIT_CONST_U64(val) \ +({ \ + jit->seen |= SEEN_LITERAL; \ + _EMIT_CONST_U64(val) - jit->base_ip; \ +}) + +#define EMIT_ZERO(b1) \ +({ \ + if (!fp->aux->verifier_zext) { \ + /* llgfr %dst,%dst (zero extend to 64 bit) */ \ + EMIT4(0xb9160000, b1, b1); \ + REG_SET_SEEN(b1); \ + } \ +}) + +/* + * Return whether this is the first pass. The first pass is special, since we + * don't know any sizes yet, and thus must be conservative. + */ +static bool is_first_pass(struct bpf_jit *jit) +{ + return jit->size == 0; +} + +/* + * Return whether this is the code generation pass. The code generation pass is + * special, since we should change as little as possible. + */ +static bool is_codegen_pass(struct bpf_jit *jit) +{ + return jit->prg_buf; +} + +/* + * Return whether "rel" can be encoded as a short PC-relative offset + */ +static bool is_valid_rel(int rel) +{ + return rel >= -65536 && rel <= 65534; +} + +/* + * Return whether "off" can be reached using a short PC-relative offset + */ +static bool can_use_rel(struct bpf_jit *jit, int off) +{ + return is_valid_rel(off - jit->prg); +} + +/* + * Return whether given displacement can be encoded using + * Long-Displacement Facility + */ +static bool is_valid_ldisp(int disp) +{ + return disp >= -524288 && disp <= 524287; +} + +/* + * Return whether the next 32-bit literal pool entry can be referenced using + * Long-Displacement Facility + */ +static bool can_use_ldisp_for_lit32(struct bpf_jit *jit) +{ + return is_valid_ldisp(jit->lit32 - jit->base_ip); +} + +/* + * Return whether the next 64-bit literal pool entry can be referenced using + * Long-Displacement Facility + */ +static bool can_use_ldisp_for_lit64(struct bpf_jit *jit) +{ + return is_valid_ldisp(jit->lit64 - jit->base_ip); +} + +/* + * Fill whole space with illegal instructions + */ +static void jit_fill_hole(void *area, unsigned int size) +{ + memset(area, 0, size); +} + +/* + * Save registers from "rs" (register start) to "re" (register end) on stack + */ +static void save_regs(struct bpf_jit *jit, u32 rs, u32 re) +{ + u32 off = STK_OFF_R6 + (rs - 6) * 8; + + if (rs == re) + /* stg %rs,off(%r15) */ + _EMIT6(0xe300f000 | rs << 20 | off, 0x0024); + else + /* stmg %rs,%re,off(%r15) */ + _EMIT6_DISP(0xeb00f000 | rs << 20 | re << 16, 0x0024, off); +} + +/* + * Restore registers from "rs" (register start) to "re" (register end) on stack + */ +static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth) +{ + u32 off = STK_OFF_R6 + (rs - 6) * 8; + + if (jit->seen & SEEN_STACK) + off += STK_OFF + stack_depth; + + if (rs == re) + /* lg %rs,off(%r15) */ + _EMIT6(0xe300f000 | rs << 20 | off, 0x0004); + else + /* lmg %rs,%re,off(%r15) */ + _EMIT6_DISP(0xeb00f000 | rs << 20 | re << 16, 0x0004, off); +} + +/* + * Return first seen register (from start) + */ +static int get_start(struct bpf_jit *jit, int start) +{ + int i; + + for (i = start; i <= 15; i++) { + if (jit->seen_reg[i]) + return i; + } + return 0; +} + +/* + * Return last seen register (from start) (gap >= 2) + */ +static int get_end(struct bpf_jit *jit, int start) +{ + int i; + + for (i = start; i < 15; i++) { + if (!jit->seen_reg[i] && !jit->seen_reg[i + 1]) + return i - 1; + } + return jit->seen_reg[15] ? 15 : 14; +} + +#define REGS_SAVE 1 +#define REGS_RESTORE 0 +/* + * Save and restore clobbered registers (6-15) on stack. + * We save/restore registers in chunks with gap >= 2 registers. + */ +static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) +{ + const int last = 15, save_restore_size = 6; + int re = 6, rs; + + if (is_first_pass(jit)) { + /* + * We don't know yet which registers are used. Reserve space + * conservatively. + */ + jit->prg += (last - re + 1) * save_restore_size; + return; + } + + do { + rs = get_start(jit, re); + if (!rs) + break; + re = get_end(jit, rs + 1); + if (op == REGS_SAVE) + save_regs(jit, rs, re); + else + restore_regs(jit, rs, re, stack_depth); + re++; + } while (re <= last); +} + +static void bpf_skip(struct bpf_jit *jit, int size) +{ + if (size >= 6 && !is_valid_rel(size)) { + /* brcl 0xf,size */ + EMIT6_PCREL_RIL(0xc0f4000000, size); + size -= 6; + } else if (size >= 4 && is_valid_rel(size)) { + /* brc 0xf,size */ + EMIT4_PCREL(0xa7f40000, size); + size -= 4; + } + while (size >= 2) { + /* bcr 0,%0 */ + _EMIT2(0x0700); + size -= 2; + } +} + +/* + * PLT for hotpatchable calls. The calling convention is the same as for the + * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. + */ +extern const char bpf_plt[]; +extern const char bpf_plt_ret[]; +extern const char bpf_plt_target[]; +extern const char bpf_plt_end[]; +#define BPF_PLT_SIZE 32 +asm( + ".pushsection .rodata\n" + " .balign 8\n" + "bpf_plt:\n" + " lgrl %r0,bpf_plt_ret\n" + " lgrl %r1,bpf_plt_target\n" + " br %r1\n" + " .balign 8\n" + "bpf_plt_ret: .quad 0\n" + "bpf_plt_target: .quad 0\n" + "bpf_plt_end:\n" + " .popsection\n" +); + +static void bpf_jit_plt(void *plt, void *ret, void *target) +{ + memcpy(plt, bpf_plt, BPF_PLT_SIZE); + *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; +} + +/* + * Emit function prologue + * + * Save registers and create stack frame if necessary. + * See stack frame layout description in "bpf_jit.h"! + */ +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, + u32 stack_depth) +{ + /* No-op for hotpatching */ + /* brcl 0,prologue_plt */ + EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); + jit->prologue_plt_ret = jit->prg; + + if (fp->aux->func_idx == 0) { + /* Initialize the tail call counter in the main program. */ + /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); + } else { + /* + * Skip the tail call counter initialization in subprograms. + * Insert nops in order to have tail_call_start at a + * predictable offset. + */ + bpf_skip(jit, 6); + } + /* Tail calls have to skip above initialization */ + jit->tail_call_start = jit->prg; + /* Save registers */ + save_restore_regs(jit, REGS_SAVE, stack_depth); + /* Setup literal pool */ + if (is_first_pass(jit) || (jit->seen & SEEN_LITERAL)) { + if (!is_first_pass(jit) && + is_valid_ldisp(jit->size - (jit->prg + 2))) { + /* basr %l,0 */ + EMIT2(0x0d00, REG_L, REG_0); + jit->base_ip = jit->prg; + } else { + /* larl %l,lit32_start */ + EMIT6_PCREL_RILB(0xc0000000, REG_L, jit->lit32_start); + jit->base_ip = jit->lit32_start; + } + } + /* Setup stack and backchain */ + if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + /* lgr %w1,%r15 (backchain) */ + EMIT4(0xb9040000, REG_W1, REG_15); + /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ + EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); + /* aghi %r15,-STK_OFF */ + EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + /* stg %w1,152(%r15) (backchain) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, 152); + } +} + +/* + * Emit an expoline for a jump that follows + */ +static void emit_expoline(struct bpf_jit *jit) +{ + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + /* j . */ + EMIT4_PCREL(0xa7f40000, 0); +} + +/* + * Emit __s390_indirect_jump_r1 thunk if necessary + */ +static void emit_r1_thunk(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) { + jit->r1_thunk_ip = jit->prg; + emit_expoline(jit); + /* br %r1 */ + _EMIT2(0x07f1); + } +} + +/* + * Call r1 either directly or via __s390_indirect_jump_r1 thunk + */ +static void call_r1(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + else + /* basr %r14,%r1 */ + EMIT2(0x0d00, REG_14, REG_1); +} + +/* + * Function epilogue + */ +static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) +{ + jit->exit_ip = jit->prg; + /* Load exit code: lgr %r2,%b0 */ + EMIT4(0xb9040000, REG_2, BPF_REG_0); + /* Restore registers */ + save_restore_regs(jit, REGS_RESTORE, stack_depth); + if (nospec_uses_trampoline()) { + jit->r14_thunk_ip = jit->prg; + /* Generate __s390_indirect_jump_r14 thunk */ + emit_expoline(jit); + } + /* br %r14 */ + _EMIT2(0x07fe); + + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + emit_r1_thunk(jit); + + jit->prg = ALIGN(jit->prg, 8); + jit->prologue_plt = jit->prg; + if (jit->prg_buf) + bpf_jit_plt(jit->prg_buf + jit->prg, + jit->prg_buf + jit->prologue_plt_ret, NULL); + jit->prg += BPF_PLT_SIZE; +} + +static int get_probe_mem_regno(const u8 *insn) +{ + /* + * insn must point to llgc, llgh, llgf or lg, which have destination + * register at the same position. + */ + if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */ + return -1; + if (insn[5] != 0x90 && /* llgc */ + insn[5] != 0x91 && /* llgh */ + insn[5] != 0x16 && /* llgf */ + insn[5] != 0x04) /* lg */ + return -1; + return insn[1] >> 4; +} + +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(x); + regs->gprs[x->data] = 0; + return true; +} + +static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, + int probe_prg, int nop_prg) +{ + struct exception_table_entry *ex; + int reg, prg; + s64 delta; + u8 *insn; + int i; + + if (!fp->aux->extable) + /* Do nothing during early JIT passes. */ + return 0; + insn = jit->prg_buf + probe_prg; + reg = get_probe_mem_regno(insn); + if (WARN_ON_ONCE(reg < 0)) + /* JIT bug - unexpected probe instruction. */ + return -1; + if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + /* JIT bug - gap between probe and nop instructions. */ + return -1; + for (i = 0; i < 2; i++) { + if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries)) + /* Verifier bug - not enough entries. */ + return -1; + ex = &fp->aux->extable[jit->excnt]; + /* Add extable entries for probe and nop instructions. */ + prg = i == 0 ? probe_prg : nop_prg; + delta = jit->prg_buf + prg - (u8 *)&ex->insn; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - code and extable must be close. */ + return -1; + ex->insn = delta; + /* + * Always land on the nop. Note that extable infrastructure + * ignores fixup field, it is handled by ex_handler_bpf(). + */ + delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - landing pad and extable must be close. */ + return -1; + ex->fixup = delta; + ex->type = EX_TYPE_BPF; + ex->data = reg; + jit->excnt++; + } + return 0; +} + +/* + * Sign-extend the register if necessary + */ +static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +{ + if (!(flags & BTF_FMODEL_SIGNED_ARG)) + return 0; + + switch (size) { + case 1: + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + return 0; + case 2: + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + return 0; + case 4: + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + return 0; + case 8: + return 0; + default: + return -1; + } +} + +/* + * Compile one eBPF instruction into s390x code + * + * NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of + * stack space for the large switch statement. + */ +static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, + int i, bool extra_pass, u32 stack_depth) +{ + struct bpf_insn *insn = &fp->insnsi[i]; + u32 dst_reg = insn->dst_reg; + u32 src_reg = insn->src_reg; + int last, insn_count = 1; + u32 *addrs = jit->addrs; + s32 imm = insn->imm; + s16 off = insn->off; + int probe_prg = -1; + unsigned int mask; + int nop_prg; + int err; + + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + probe_prg = jit->prg; + + switch (insn->code) { + /* + * BPF_MOV + */ + case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ + /* llgfr %dst,%src */ + EMIT4(0xb9160000, dst_reg, src_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ + /* lgr %dst,%src */ + EMIT4(0xb9040000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ + /* llilf %dst,imm */ + EMIT6_IMM(0xc00f0000, dst_reg, imm); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = imm */ + /* lgfi %dst,imm */ + EMIT6_IMM(0xc0010000, dst_reg, imm); + break; + /* + * BPF_LD 64 + */ + case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ + { + /* 16 byte instruction that uses two 'struct bpf_insn' */ + u64 imm64; + + imm64 = (u64)(u32) insn[0].imm | ((u64)(u32) insn[1].imm) << 32; + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, _EMIT_CONST_U64(imm64)); + insn_count = 2; + break; + } + /* + * BPF_ADD + */ + case BPF_ALU | BPF_ADD | BPF_X: /* dst = (u32) dst + (u32) src */ + /* ar %dst,%src */ + EMIT2(0x1a00, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_ADD | BPF_X: /* dst = dst + src */ + /* agr %dst,%src */ + EMIT4(0xb9080000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ + if (!imm) + break; + /* agfi %dst,imm */ + EMIT6_IMM(0xc2080000, dst_reg, imm); + break; + /* + * BPF_SUB + */ + case BPF_ALU | BPF_SUB | BPF_X: /* dst = (u32) dst - (u32) src */ + /* sr %dst,%src */ + EMIT2(0x1b00, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_SUB | BPF_X: /* dst = dst - src */ + /* sgr %dst,%src */ + EMIT4(0xb9090000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ + if (!imm) + break; + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } + break; + /* + * BPF_MUL + */ + case BPF_ALU | BPF_MUL | BPF_X: /* dst = (u32) dst * (u32) src */ + /* msr %dst,%src */ + EMIT4(0xb2520000, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_MUL | BPF_X: /* dst = dst * src */ + /* msgr %dst,%src */ + EMIT4(0xb90c0000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ + if (imm == 1) + break; + /* msgfi %dst,imm */ + EMIT6_IMM(0xc2000000, dst_reg, imm); + break; + /* + * BPF_DIV / BPF_MOD + */ + case BPF_ALU | BPF_DIV | BPF_X: /* dst = (u32) dst / (u32) src */ + case BPF_ALU | BPF_MOD | BPF_X: /* dst = (u32) dst % (u32) src */ + { + int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; + + /* lhi %w0,0 */ + EMIT4_IMM(0xa7080000, REG_W0, 0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + /* dlr %w0,%src */ + EMIT4(0xb9970000, REG_W0, src_reg); + /* llgfr %dst,%rc */ + EMIT4(0xb9160000, dst_reg, rc_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + } + case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ + case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */ + { + int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; + + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, src_reg); + /* lgr %dst,%rc */ + EMIT4(0xb9040000, dst_reg, rc_reg); + break; + } + case BPF_ALU | BPF_DIV | BPF_K: /* dst = (u32) dst / (u32) imm */ + case BPF_ALU | BPF_MOD | BPF_K: /* dst = (u32) dst % (u32) imm */ + { + int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; + + if (imm == 1) { + if (BPF_OP(insn->code) == BPF_MOD) + /* lhgi %dst,0 */ + EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); + break; + } + /* lhi %w0,0 */ + EMIT4_IMM(0xa7080000, REG_W0, 0); + /* lr %w1,%dst */ + EMIT2(0x1800, REG_W1, dst_reg); + if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) { + /* dl %w0,(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L, + EMIT_CONST_U32(imm)); + } else { + /* lgfrl %dst,imm */ + EMIT6_PCREL_RILB(0xc40c0000, dst_reg, + _EMIT_CONST_U32(imm)); + jit->seen |= SEEN_LITERAL; + /* dlr %w0,%dst */ + EMIT4(0xb9970000, REG_W0, dst_reg); + } + /* llgfr %dst,%rc */ + EMIT4(0xb9160000, dst_reg, rc_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + } + case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ + case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */ + { + int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; + + if (imm == 1) { + if (BPF_OP(insn->code) == BPF_MOD) + /* lhgi %dst,0 */ + EMIT4_IMM(0xa7090000, dst_reg, 0); + break; + } + /* lghi %w0,0 */ + EMIT4_IMM(0xa7090000, REG_W0, 0); + /* lgr %w1,%dst */ + EMIT4(0xb9040000, REG_W1, dst_reg); + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* dlg %w0,(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %dst,imm */ + EMIT6_PCREL_RILB(0xc4080000, dst_reg, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* dlgr %w0,%dst */ + EMIT4(0xb9870000, REG_W0, dst_reg); + } + /* lgr %dst,%rc */ + EMIT4(0xb9040000, dst_reg, rc_reg); + break; + } + /* + * BPF_AND + */ + case BPF_ALU | BPF_AND | BPF_X: /* dst = (u32) dst & (u32) src */ + /* nr %dst,%src */ + EMIT2(0x1400, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_AND | BPF_X: /* dst = dst & src */ + /* ngr %dst,%src */ + EMIT4(0xb9800000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_AND | BPF_K: /* dst = (u32) dst & (u32) imm */ + /* nilf %dst,imm */ + EMIT6_IMM(0xc00b0000, dst_reg, imm); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */ + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* ng %dst,(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0080, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* ngr %dst,%w0 */ + EMIT4(0xb9800000, dst_reg, REG_W0); + } + break; + /* + * BPF_OR + */ + case BPF_ALU | BPF_OR | BPF_X: /* dst = (u32) dst | (u32) src */ + /* or %dst,%src */ + EMIT2(0x1600, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_OR | BPF_X: /* dst = dst | src */ + /* ogr %dst,%src */ + EMIT4(0xb9810000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_OR | BPF_K: /* dst = (u32) dst | (u32) imm */ + /* oilf %dst,imm */ + EMIT6_IMM(0xc00d0000, dst_reg, imm); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_OR | BPF_K: /* dst = dst | imm */ + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* og %dst,(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0081, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* ogr %dst,%w0 */ + EMIT4(0xb9810000, dst_reg, REG_W0); + } + break; + /* + * BPF_XOR + */ + case BPF_ALU | BPF_XOR | BPF_X: /* dst = (u32) dst ^ (u32) src */ + /* xr %dst,%src */ + EMIT2(0x1700, dst_reg, src_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_XOR | BPF_X: /* dst = dst ^ src */ + /* xgr %dst,%src */ + EMIT4(0xb9820000, dst_reg, src_reg); + break; + case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ + if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) { + /* xg %dst,(%l) */ + EMIT6_DISP_LH(0xe3000000, 0x0082, + dst_reg, REG_0, REG_L, + EMIT_CONST_U64(imm)); + } else { + /* lgrl %w0,imm */ + EMIT6_PCREL_RILB(0xc4080000, REG_W0, + _EMIT_CONST_U64(imm)); + jit->seen |= SEEN_LITERAL; + /* xgr %dst,%w0 */ + EMIT4(0xb9820000, dst_reg, REG_W0); + } + break; + /* + * BPF_LSH + */ + case BPF_ALU | BPF_LSH | BPF_X: /* dst = (u32) dst << (u32) src */ + /* sll %dst,0(%src) */ + EMIT4_DISP(0x89000000, dst_reg, src_reg, 0); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_LSH | BPF_X: /* dst = dst << src */ + /* sllg %dst,%dst,0(%src) */ + EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); + break; + case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ + if (imm == 0) + break; + /* sllg %dst,%dst,imm(%r0) */ + EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, REG_0, imm); + break; + /* + * BPF_RSH + */ + case BPF_ALU | BPF_RSH | BPF_X: /* dst = (u32) dst >> (u32) src */ + /* srl %dst,0(%src) */ + EMIT4_DISP(0x88000000, dst_reg, src_reg, 0); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_RSH | BPF_X: /* dst = dst >> src */ + /* srlg %dst,%dst,0(%src) */ + EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); + break; + case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ + if (imm == 0) + break; + /* srlg %dst,%dst,imm(%r0) */ + EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, REG_0, imm); + break; + /* + * BPF_ARSH + */ + case BPF_ALU | BPF_ARSH | BPF_X: /* ((s32) dst) >>= src */ + /* sra %dst,%dst,0(%src) */ + EMIT4_DISP(0x8a000000, dst_reg, src_reg, 0); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_ARSH | BPF_X: /* ((s64) dst) >>= src */ + /* srag %dst,%dst,0(%src) */ + EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); + break; + case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */ + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ + if (imm == 0) + break; + /* srag %dst,%dst,imm(%r0) */ + EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, REG_0, imm); + break; + /* + * BPF_NEG + */ + case BPF_ALU | BPF_NEG: /* dst = (u32) -dst */ + /* lcr %dst,%dst */ + EMIT2(0x1300, dst_reg, dst_reg); + EMIT_ZERO(dst_reg); + break; + case BPF_ALU64 | BPF_NEG: /* dst = -dst */ + /* lcgr %dst,%dst */ + EMIT4(0xb9030000, dst_reg, dst_reg); + break; + /* + * BPF_FROM_BE/LE + */ + case BPF_ALU | BPF_END | BPF_FROM_BE: + /* s390 is big endian, therefore only clear high order bytes */ + switch (imm) { + case 16: /* dst = (u16) cpu_to_be16(dst) */ + /* llghr %dst,%dst */ + EMIT4(0xb9850000, dst_reg, dst_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case 32: /* dst = (u32) cpu_to_be32(dst) */ + if (!fp->aux->verifier_zext) + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + case 64: /* dst = (u64) cpu_to_be64(dst) */ + break; + } + break; + case BPF_ALU | BPF_END | BPF_FROM_LE: + switch (imm) { + case 16: /* dst = (u16) cpu_to_le16(dst) */ + /* lrvr %dst,%dst */ + EMIT4(0xb91f0000, dst_reg, dst_reg); + /* srl %dst,16(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, 16); + /* llghr %dst,%dst */ + EMIT4(0xb9850000, dst_reg, dst_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case 32: /* dst = (u32) cpu_to_le32(dst) */ + /* lrvr %dst,%dst */ + EMIT4(0xb91f0000, dst_reg, dst_reg); + if (!fp->aux->verifier_zext) + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); + break; + case 64: /* dst = (u64) cpu_to_le64(dst) */ + /* lrvgr %dst,%dst */ + EMIT4(0xb90f0000, dst_reg, dst_reg); + break; + } + break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* + * BPF_ST(X) + */ + case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */ + /* stcy %src,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0072, src_reg, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_STX | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = src */ + /* sthy %src,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0070, src_reg, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_STX | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = src */ + /* sty %src,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0050, src_reg, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_STX | BPF_MEM | BPF_DW: /* (u64 *)(dst + off) = src */ + /* stg %src,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, src_reg, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_ST | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = imm */ + /* lhi %w0,imm */ + EMIT4_IMM(0xa7080000, REG_W0, (u8) imm); + /* stcy %w0,off(dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0072, REG_W0, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_ST | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = imm */ + /* lhi %w0,imm */ + EMIT4_IMM(0xa7080000, REG_W0, (u16) imm); + /* sthy %w0,off(dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0070, REG_W0, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_ST | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = imm */ + /* llilf %w0,imm */ + EMIT6_IMM(0xc00f0000, REG_W0, (u32) imm); + /* sty %w0,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0050, REG_W0, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + case BPF_ST | BPF_MEM | BPF_DW: /* *(u64 *)(dst + off) = imm */ + /* lgfi %w0,imm */ + EMIT6_IMM(0xc0010000, REG_W0, imm); + /* stg %w0,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, dst_reg, REG_0, off); + jit->seen |= SEEN_MEM; + break; + /* + * BPF_ATOMIC + */ + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + { + bool is32 = BPF_SIZE(insn->code) == BPF_W; + + switch (insn->imm) { +/* {op32|op64} {%w0|%src},%src,off(%dst) */ +#define EMIT_ATOMIC(op32, op64) do { \ + EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ + (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ + src_reg, dst_reg, off); \ + if (is32 && (insn->imm & BPF_FETCH)) \ + EMIT_ZERO(src_reg); \ +} while (0) + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + /* {laal|laalg} */ + EMIT_ATOMIC(0x00fa, 0x00ea); + break; + case BPF_AND: + case BPF_AND | BPF_FETCH: + /* {lan|lang} */ + EMIT_ATOMIC(0x00f4, 0x00e4); + break; + case BPF_OR: + case BPF_OR | BPF_FETCH: + /* {lao|laog} */ + EMIT_ATOMIC(0x00f6, 0x00e6); + break; + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + /* {lax|laxg} */ + EMIT_ATOMIC(0x00f7, 0x00e7); + break; +#undef EMIT_ATOMIC + case BPF_XCHG: + /* {ly|lg} %w0,off(%dst) */ + EMIT6_DISP_LH(0xe3000000, + is32 ? 0x0058 : 0x0004, REG_W0, REG_0, + dst_reg, off); + /* 0: {csy|csg} %w0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, + REG_W0, src_reg, dst_reg, off); + /* brc 4,0b */ + EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6); + /* {llgfr|lgr} %src,%w0 */ + EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0); + if (is32 && insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_CMPXCHG: + /* 0: {csy|csg} %b0,%src,off(%dst) */ + EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030, + BPF_REG_0, src_reg, dst_reg, off); + break; + default: + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + + jit->seen |= SEEN_MEM; + break; + } + /* + * BPF_LDX + */ + case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: + /* llgc %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: + /* llgh %dst,0(off,%src) */ + EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); + jit->seen |= SEEN_MEM; + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: + /* llgf %dst,off(%src) */ + jit->seen |= SEEN_MEM; + EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); + if (insn_is_zext(&insn[1])) + insn_count = 2; + break; + case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + /* lg %dst,0(off,%src) */ + jit->seen |= SEEN_MEM; + EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); + break; + /* + * BPF_JMP / CALL + */ + case BPF_JMP | BPF_CALL: + { + const struct btf_func_model *m; + bool func_addr_fixed; + int j, ret; + u64 func; + + ret = bpf_jit_get_func_addr(fp, insn, extra_pass, + &func, &func_addr_fixed); + if (ret < 0) + return -1; + + REG_SET_SEEN(BPF_REG_5); + jit->seen |= SEEN_FUNC; + /* + * Copy the tail call counter to where the callee expects it. + * + * Note 1: The callee can increment the tail call counter, but + * we do not load it back, since the x86 JIT does not do this + * either. + * + * Note 2: We assume that the verifier does not let us call the + * main program, which clears the tail call counter on entry. + */ + /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, + 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); + + /* Sign-extend the kfunc arguments. */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + m = bpf_jit_find_kfunc_model(fp, insn); + if (!m) + return -1; + + for (j = 0; j < m->nr_args; j++) { + if (sign_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) + return -1; + } + } + + /* lgrl %w1,func */ + EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); + /* %r1() */ + call_r1(jit); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); + break; + } + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + + /* + * Implicit input: + * B1: pointer to ctx + * B2: pointer to bpf_array + * B3: index in bpf_array + * + * if (index >= array->map.max_entries) + * goto out; + */ + + /* llgf %w1,map.max_entries(%b2) */ + EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, + offsetof(struct bpf_array, map.max_entries)); + /* if ((u32)%b3 >= (u32)%w1) goto out; */ + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); + + /* + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) + * goto out; + */ + + if (jit->seen & SEEN_STACK) + off = STK_OFF_TCCNT + STK_OFF + stack_depth; + else + off = STK_OFF_TCCNT; + /* lhi %w0,1 */ + EMIT4_IMM(0xa7080000, REG_W0, 1); + /* laal %w1,%w0,off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); + /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, + 2, jit->prg); + + /* + * prog = array->ptrs[index]; + * if (prog == NULL) + * goto out; + */ + + /* llgfr %r1,%b3: %r1 = (u32) index */ + EMIT4(0xb9160000, REG_1, BPF_REG_3); + /* sllg %r1,%r1,3: %r1 *= 8 */ + EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, REG_1, REG_0, 3); + /* ltg %r1,prog(%b2,%r1) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, + REG_1, offsetof(struct bpf_array, ptrs)); + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); + + /* + * Restore registers before calling function + */ + save_restore_regs(jit, REGS_RESTORE, stack_depth); + + /* + * goto *(prog->bpf_func + tail_call_start); + */ + + /* lg %r1,bpf_func(%r1) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0, + offsetof(struct bpf_prog, bpf_func)); + if (nospec_uses_trampoline()) { + jit->seen |= SEEN_FUNC; + /* aghi %r1,tail_call_start */ + EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); + /* brcl 0xf,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + } else { + /* bc 0xf,tail_call_start(%r1) */ + _EMIT4(0x47f01000 + jit->tail_call_start); + } + /* out: */ + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } + break; + } + case BPF_JMP | BPF_EXIT: /* return b0 */ + last = (i == fp->len - 1) ? 1 : 0; + if (last) + break; + if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip)) + /* brc 0xf, */ + EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip); + else + /* brcl 0xf, */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip); + break; + /* + * Branch relative (number of skipped instructions) to offset on + * condition. + * + * Condition code to mask mapping: + * + * CC | Description | Mask + * ------------------------------ + * 0 | Operands equal | 8 + * 1 | First operand low | 4 + * 2 | First operand high | 2 + * 3 | Unused | 1 + * + * For s390x relative branches: ip = ip + off_bytes + * For BPF relative branches: insn = insn + off_insns + 1 + * + * For example for s390x with offset 0 we jump to the branch + * instruction itself (loop) and for BPF with offset 0 we + * branch to the instruction behind the branch. + */ + case BPF_JMP | BPF_JA: /* if (true) */ + mask = 0xf000; /* j */ + goto branch_oc; + case BPF_JMP | BPF_JSGT | BPF_K: /* ((s64) dst > (s64) imm) */ + case BPF_JMP32 | BPF_JSGT | BPF_K: /* ((s32) dst > (s32) imm) */ + mask = 0x2000; /* jh */ + goto branch_ks; + case BPF_JMP | BPF_JSLT | BPF_K: /* ((s64) dst < (s64) imm) */ + case BPF_JMP32 | BPF_JSLT | BPF_K: /* ((s32) dst < (s32) imm) */ + mask = 0x4000; /* jl */ + goto branch_ks; + case BPF_JMP | BPF_JSGE | BPF_K: /* ((s64) dst >= (s64) imm) */ + case BPF_JMP32 | BPF_JSGE | BPF_K: /* ((s32) dst >= (s32) imm) */ + mask = 0xa000; /* jhe */ + goto branch_ks; + case BPF_JMP | BPF_JSLE | BPF_K: /* ((s64) dst <= (s64) imm) */ + case BPF_JMP32 | BPF_JSLE | BPF_K: /* ((s32) dst <= (s32) imm) */ + mask = 0xc000; /* jle */ + goto branch_ks; + case BPF_JMP | BPF_JGT | BPF_K: /* (dst_reg > imm) */ + case BPF_JMP32 | BPF_JGT | BPF_K: /* ((u32) dst_reg > (u32) imm) */ + mask = 0x2000; /* jh */ + goto branch_ku; + case BPF_JMP | BPF_JLT | BPF_K: /* (dst_reg < imm) */ + case BPF_JMP32 | BPF_JLT | BPF_K: /* ((u32) dst_reg < (u32) imm) */ + mask = 0x4000; /* jl */ + goto branch_ku; + case BPF_JMP | BPF_JGE | BPF_K: /* (dst_reg >= imm) */ + case BPF_JMP32 | BPF_JGE | BPF_K: /* ((u32) dst_reg >= (u32) imm) */ + mask = 0xa000; /* jhe */ + goto branch_ku; + case BPF_JMP | BPF_JLE | BPF_K: /* (dst_reg <= imm) */ + case BPF_JMP32 | BPF_JLE | BPF_K: /* ((u32) dst_reg <= (u32) imm) */ + mask = 0xc000; /* jle */ + goto branch_ku; + case BPF_JMP | BPF_JNE | BPF_K: /* (dst_reg != imm) */ + case BPF_JMP32 | BPF_JNE | BPF_K: /* ((u32) dst_reg != (u32) imm) */ + mask = 0x7000; /* jne */ + goto branch_ku; + case BPF_JMP | BPF_JEQ | BPF_K: /* (dst_reg == imm) */ + case BPF_JMP32 | BPF_JEQ | BPF_K: /* ((u32) dst_reg == (u32) imm) */ + mask = 0x8000; /* je */ + goto branch_ku; + case BPF_JMP | BPF_JSET | BPF_K: /* (dst_reg & imm) */ + case BPF_JMP32 | BPF_JSET | BPF_K: /* ((u32) dst_reg & (u32) imm) */ + mask = 0x7000; /* jnz */ + if (BPF_CLASS(insn->code) == BPF_JMP32) { + /* llilf %w1,imm (load zero extend imm) */ + EMIT6_IMM(0xc00f0000, REG_W1, imm); + /* nr %w1,%dst */ + EMIT2(0x1400, REG_W1, dst_reg); + } else { + /* lgfi %w1,imm (load sign extend imm) */ + EMIT6_IMM(0xc0010000, REG_W1, imm); + /* ngr %w1,%dst */ + EMIT4(0xb9800000, REG_W1, dst_reg); + } + goto branch_oc; + + case BPF_JMP | BPF_JSGT | BPF_X: /* ((s64) dst > (s64) src) */ + case BPF_JMP32 | BPF_JSGT | BPF_X: /* ((s32) dst > (s32) src) */ + mask = 0x2000; /* jh */ + goto branch_xs; + case BPF_JMP | BPF_JSLT | BPF_X: /* ((s64) dst < (s64) src) */ + case BPF_JMP32 | BPF_JSLT | BPF_X: /* ((s32) dst < (s32) src) */ + mask = 0x4000; /* jl */ + goto branch_xs; + case BPF_JMP | BPF_JSGE | BPF_X: /* ((s64) dst >= (s64) src) */ + case BPF_JMP32 | BPF_JSGE | BPF_X: /* ((s32) dst >= (s32) src) */ + mask = 0xa000; /* jhe */ + goto branch_xs; + case BPF_JMP | BPF_JSLE | BPF_X: /* ((s64) dst <= (s64) src) */ + case BPF_JMP32 | BPF_JSLE | BPF_X: /* ((s32) dst <= (s32) src) */ + mask = 0xc000; /* jle */ + goto branch_xs; + case BPF_JMP | BPF_JGT | BPF_X: /* (dst > src) */ + case BPF_JMP32 | BPF_JGT | BPF_X: /* ((u32) dst > (u32) src) */ + mask = 0x2000; /* jh */ + goto branch_xu; + case BPF_JMP | BPF_JLT | BPF_X: /* (dst < src) */ + case BPF_JMP32 | BPF_JLT | BPF_X: /* ((u32) dst < (u32) src) */ + mask = 0x4000; /* jl */ + goto branch_xu; + case BPF_JMP | BPF_JGE | BPF_X: /* (dst >= src) */ + case BPF_JMP32 | BPF_JGE | BPF_X: /* ((u32) dst >= (u32) src) */ + mask = 0xa000; /* jhe */ + goto branch_xu; + case BPF_JMP | BPF_JLE | BPF_X: /* (dst <= src) */ + case BPF_JMP32 | BPF_JLE | BPF_X: /* ((u32) dst <= (u32) src) */ + mask = 0xc000; /* jle */ + goto branch_xu; + case BPF_JMP | BPF_JNE | BPF_X: /* (dst != src) */ + case BPF_JMP32 | BPF_JNE | BPF_X: /* ((u32) dst != (u32) src) */ + mask = 0x7000; /* jne */ + goto branch_xu; + case BPF_JMP | BPF_JEQ | BPF_X: /* (dst == src) */ + case BPF_JMP32 | BPF_JEQ | BPF_X: /* ((u32) dst == (u32) src) */ + mask = 0x8000; /* je */ + goto branch_xu; + case BPF_JMP | BPF_JSET | BPF_X: /* (dst & src) */ + case BPF_JMP32 | BPF_JSET | BPF_X: /* ((u32) dst & (u32) src) */ + { + bool is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; + + mask = 0x7000; /* jnz */ + /* nrk or ngrk %w1,%dst,%src */ + EMIT4_RRF((is_jmp32 ? 0xb9f40000 : 0xb9e40000), + REG_W1, dst_reg, src_reg); + goto branch_oc; +branch_ks: + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; + /* cfi or cgfi %dst,imm */ + EMIT6_IMM(is_jmp32 ? 0xc20d0000 : 0xc20c0000, + dst_reg, imm); + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* brc mask,off */ + EMIT4_PCREL_RIC(0xa7040000, + mask >> 12, addrs[i + off + 1]); + } else { + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } + break; +branch_ku: + /* lgfi %w1,imm (load sign extend imm) */ + src_reg = REG_1; + EMIT6_IMM(0xc0010000, src_reg, imm); + goto branch_xu; +branch_xs: + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* crj or cgrj %dst,%src,mask,off */ + EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064), + dst_reg, src_reg, i, off, mask); + } else { + /* cr or cgr %dst,%src */ + if (is_jmp32) + EMIT2(0x1900, dst_reg, src_reg); + else + EMIT4(0xb9200000, dst_reg, src_reg); + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } + break; +branch_xu: + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* clrj or clgrj %dst,%src,mask,off */ + EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065), + dst_reg, src_reg, i, off, mask); + } else { + /* clr or clgr %dst,%src */ + if (is_jmp32) + EMIT2(0x1500, dst_reg, src_reg); + else + EMIT4(0xb9210000, dst_reg, src_reg); + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } + break; +branch_oc: + if (!is_first_pass(jit) && + can_use_rel(jit, addrs[i + off + 1])) { + /* brc mask,off */ + EMIT4_PCREL_RIC(0xa7040000, + mask >> 12, addrs[i + off + 1]); + } else { + /* brcl mask,off */ + EMIT6_PCREL_RILC(0xc0040000, + mask >> 12, addrs[i + off + 1]); + } + break; + } + default: /* too complex, give up */ + pr_err("Unknown opcode %02x\n", insn->code); + return -1; + } + + if (probe_prg != -1) { + /* + * Handlers of certain exceptions leave psw.addr pointing to + * the instruction directly after the failing one. Therefore, + * create two exception table entries and also add a nop in + * case two probing instructions come directly after each + * other. + */ + nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); + err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); + if (err < 0) + return err; + } + + return insn_count; +} + +/* + * Return whether new i-th instruction address does not violate any invariant + */ +static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i) +{ + /* On the first pass anything goes */ + if (is_first_pass(jit)) + return true; + + /* The codegen pass must not change anything */ + if (is_codegen_pass(jit)) + return jit->addrs[i] == jit->prg; + + /* Passes in between must not increase code size */ + return jit->addrs[i] >= jit->prg; +} + +/* + * Update the address of i-th instruction + */ +static int bpf_set_addr(struct bpf_jit *jit, int i) +{ + int delta; + + if (is_codegen_pass(jit)) { + delta = jit->prg - jit->addrs[i]; + if (delta < 0) + bpf_skip(jit, -delta); + } + if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i))) + return -1; + jit->addrs[i] = jit->prg; + return 0; +} + +/* + * Compile eBPF program into s390x code + */ +static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, + bool extra_pass, u32 stack_depth) +{ + int i, insn_count, lit32_size, lit64_size; + + jit->lit32 = jit->lit32_start; + jit->lit64 = jit->lit64_start; + jit->prg = 0; + jit->excnt = 0; + + bpf_jit_prologue(jit, fp, stack_depth); + if (bpf_set_addr(jit, 0) < 0) + return -1; + for (i = 0; i < fp->len; i += insn_count) { + insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth); + if (insn_count < 0) + return -1; + /* Next instruction address */ + if (bpf_set_addr(jit, i + insn_count) < 0) + return -1; + } + bpf_jit_epilogue(jit, stack_depth); + + lit32_size = jit->lit32 - jit->lit32_start; + lit64_size = jit->lit64 - jit->lit64_start; + jit->lit32_start = jit->prg; + if (lit32_size) + jit->lit32_start = ALIGN(jit->lit32_start, 4); + jit->lit64_start = jit->lit32_start + lit32_size; + if (lit64_size) + jit->lit64_start = ALIGN(jit->lit64_start, 8); + jit->size = jit->lit64_start + lit64_size; + jit->size_prg = jit->prg; + + if (WARN_ON_ONCE(fp->aux->extable && + jit->excnt != fp->aux->num_exentries)) + /* Verifier bug - too many entries. */ + return -1; + + return 0; +} + +bool bpf_jit_needs_zext(void) +{ + return true; +} + +struct s390_jit_data { + struct bpf_binary_header *header; + struct bpf_jit ctx; + int pass; +}; + +static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, + struct bpf_prog *fp) +{ + struct bpf_binary_header *header; + u32 extable_size; + u32 code_size; + + /* We need two entries per insn. */ + fp->aux->num_exentries *= 2; + + code_size = roundup(jit->size, + __alignof__(struct exception_table_entry)); + extable_size = fp->aux->num_exentries * + sizeof(struct exception_table_entry); + header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf, + 8, jit_fill_hole); + if (!header) + return NULL; + fp->aux->extable = (struct exception_table_entry *) + (jit->prg_buf + code_size); + return header; +} + +/* + * Compile eBPF program "fp" + */ +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) +{ + u32 stack_depth = round_up(fp->aux->stack_depth, 8); + struct bpf_prog *tmp, *orig_fp = fp; + struct bpf_binary_header *header; + struct s390_jit_data *jit_data; + bool tmp_blinded = false; + bool extra_pass = false; + struct bpf_jit jit; + int pass; + + if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) + return orig_fp; + + if (!fp->jit_requested) + return orig_fp; + + tmp = bpf_jit_blind_constants(fp); + /* + * If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. + */ + if (IS_ERR(tmp)) + return orig_fp; + if (tmp != fp) { + tmp_blinded = true; + fp = tmp; + } + + jit_data = fp->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + fp = orig_fp; + goto out; + } + fp->aux->jit_data = jit_data; + } + if (jit_data->ctx.addrs) { + jit = jit_data->ctx; + header = jit_data->header; + extra_pass = true; + pass = jit_data->pass + 1; + goto skip_init_ctx; + } + + memset(&jit, 0, sizeof(jit)); + jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); + if (jit.addrs == NULL) { + fp = orig_fp; + goto free_addrs; + } + /* + * Three initial passes: + * - 1/2: Determine clobbered registers + * - 3: Calculate program size and addrs array + */ + for (pass = 1; pass <= 3; pass++) { + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { + fp = orig_fp; + goto free_addrs; + } + } + /* + * Final pass: Allocate and generate program + */ + header = bpf_jit_alloc(&jit, fp); + if (!header) { + fp = orig_fp; + goto free_addrs; + } +skip_init_ctx: + if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) { + bpf_jit_binary_free(header); + fp = orig_fp; + goto free_addrs; + } + if (bpf_jit_enable > 1) { + bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf); + print_fn_code(jit.prg_buf, jit.size_prg); + } + if (!fp->is_func || extra_pass) { + bpf_jit_binary_lock_ro(header); + } else { + jit_data->header = header; + jit_data->ctx = jit; + jit_data->pass = pass; + } + fp->bpf_func = (void *) jit.prg_buf; + fp->jited = 1; + fp->jited_len = jit.size; + + if (!fp->is_func || extra_pass) { + bpf_prog_fill_jited_linfo(fp, jit.addrs + 1); +free_addrs: + kvfree(jit.addrs); + kfree(jit_data); + fp->aux->jit_data = NULL; + } +out: + if (tmp_blinded) + bpf_jit_prog_release_other(fp, fp == orig_fp ? + tmp : orig_fp); + return fp; +} + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + struct { + u16 opc; + s32 disp; + } __packed insn; + char expected_plt[BPF_PLT_SIZE]; + char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; + char *plt; + char *ret; + int err; + + /* Verify the branch to be patched. */ + err = copy_from_kernel_nofault(&insn, ip, sizeof(insn)); + if (err < 0) + return err; + if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) + return -EINVAL; + + if (t == BPF_MOD_JUMP && + insn.disp == ((char *)new_addr - (char *)ip) >> 1) { + /* + * The branch already points to the destination, + * there is no PLT. + */ + } else { + /* Verify the PLT. */ + plt = (char *)ip + (insn.disp << 1); + err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + if (err < 0) + return err; + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); + if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + return -EINVAL; + /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); + s390_kernel_write(plt + (bpf_plt_target - bpf_plt), + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); + } + + /* Adjust the mask of the branch. */ + insn.opc = 0xc004 | (new_addr ? 0xf0 : 0); + s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1); + + /* Make the new code visible to the other CPUs. */ + text_poke_sync_lock(); + + return 0; +} + +struct bpf_tramp_jit { + struct bpf_jit common; + int orig_stack_args_off;/* Offset of arguments placed on stack by the + * func_addr's original caller + */ + int stack_size; /* Trampoline stack size */ + int backchain_off; /* Offset of backchain */ + int stack_args_off; /* Offset of stack arguments for calling + * func_addr, has to be at the top + */ + int reg_args_off; /* Offset of register arguments for calling + * func_addr + */ + int ip_off; /* For bpf_get_func_ip(), has to be at + * (ctx - 16) + */ + int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at + * (ctx - 8) + */ + int bpf_args_off; /* Offset of BPF_PROG context, which consists + * of BPF arguments followed by return value + */ + int retval_off; /* Offset of return value (see above) */ + int r7_r8_off; /* Offset of saved %r7 and %r8, which are used + * for __bpf_prog_enter() return value and + * func_addr respectively + */ + int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int tccnt_off; /* Offset of saved tailcall counter */ + int r14_off; /* Offset of saved %r14, has to be at the + * bottom */ + int do_fexit; /* do_fexit: label */ +}; + +static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) +{ + /* llihf %dst_reg,val_hi */ + EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32)); + /* oilf %rdst_reg,val_lo */ + EMIT6_IMM(0xc00d0000, dst_reg, val); +} + +static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_link *tlink, bool save_ret) +{ + struct bpf_jit *jit = &tjit->common; + int cookie_off = tjit->run_ctx_off + + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = tlink->link.prog; + int patch; + + /* + * run_ctx.cookie = tlink->cookie; + */ + + /* %r0 = tlink->cookie */ + load_imm64(jit, REG_W0, tlink->cookie); + /* stg %r0,cookie_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + + /* + * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) + * goto skip; + */ + + /* %r1 = __bpf_prog_enter */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* la %r3,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + /* ltgr %r7,%r2 */ + EMIT4(0xb9020000, REG_7, REG_2); + /* brcl 8,skip */ + patch = jit->prg; + EMIT6_PCREL_RILC(0xc0040000, 8, 0); + + /* + * retval = bpf_func(args, p->insnsi); + */ + + /* %r1 = p->bpf_func */ + load_imm64(jit, REG_1, (u64)p->bpf_func); + /* la %r2,bpf_args_off(%r15) */ + EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); + /* %r3 = p->insnsi */ + if (!p->jited) + load_imm64(jit, REG_3, (u64)p->insnsi); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + if (save_ret) { + if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + return -1; + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + } + + /* skip: */ + if (jit->prg_buf) + *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1; + + /* + * __bpf_prog_exit(p, start, &run_ctx); + */ + + /* %r1 = __bpf_prog_exit */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* lgr %r3,%r7 */ + EMIT4(0xb9040000, REG_3, REG_7); + /* la %r4,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + + return 0; +} + +static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size) +{ + int stack_offset = tjit->stack_size; + + tjit->stack_size += size; + return stack_offset; +} + +/* ABI uses %r2 - %r6 for parameter passing. */ +#define MAX_NR_REG_ARGS 5 + +/* The "L" field of the "mvc" instruction is 8 bits. */ +#define MAX_MVC_SIZE 256 +#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64)) + +/* -mfentry generates a 6-byte nop on s390x. */ +#define S390X_PATCH_SIZE 6 + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, + struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + int nr_bpf_args, nr_reg_args, nr_stack_args; + struct bpf_jit *jit = &tjit->common; + int arg, bpf_arg_off; + int i, j; + + /* Support as many stack arguments as "mvc" instruction can handle. */ + nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS); + nr_stack_args = m->nr_args - nr_reg_args; + if (nr_stack_args > MAX_NR_STACK_ARGS) + return -ENOTSUPP; + + /* Return to %r14, since func_addr and %r0 are not available. */ + if (!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) + flags |= BPF_TRAMP_F_SKIP_FRAME; + + /* + * Compute how many arguments we need to pass to BPF programs. + * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or + * smaller are packed into 1 or 2 registers; larger arguments are + * passed via pointers. + * In s390x ABI, arguments that are 8 bytes or smaller are packed into + * a register; larger arguments are passed via pointers. + * We need to deal with this difference. + */ + nr_bpf_args = 0; + for (i = 0; i < m->nr_args; i++) { + if (m->arg_size[i] <= 8) + nr_bpf_args += 1; + else if (m->arg_size[i] <= 16) + nr_bpf_args += 2; + else + return -ENOTSUPP; + } + + /* + * Calculate the stack layout. + */ + + /* + * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x + * ABI requires, put our backchain at the end of the allocated memory. + */ + tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->backchain_off = tjit->stack_size - sizeof(u64); + tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); + tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + tjit->ip_off = alloc_stack(tjit, sizeof(u64)); + tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); + tjit->retval_off = alloc_stack(tjit, sizeof(u64)); + tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); + tjit->run_ctx_off = alloc_stack(tjit, + sizeof(struct bpf_tramp_run_ctx)); + tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2); + /* + * In accordance with the s390x ABI, the caller has allocated + * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's + * backchain, and the rest we can use. + */ + tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64); + tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + + /* lgr %r1,%r15 */ + EMIT4(0xb9040000, REG_1, REG_15); + /* aghi %r15,-stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stg %r1,backchain_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, + tjit->backchain_off); + /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); + /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + for (i = 0, j = 0; i < m->nr_args; i++) { + if (i < MAX_NR_REG_ARGS) + arg = REG_2 + i; + else + arg = tjit->orig_stack_args_off + + (i - MAX_NR_REG_ARGS) * sizeof(u64); + bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64); + if (m->arg_size[i] <= 8) { + if (i < MAX_NR_REG_ARGS) + /* stg %arg,bpf_arg_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, arg, + REG_0, REG_15, bpf_arg_off); + else + /* mvc bpf_arg_off(8,%r15),arg(%r15) */ + _EMIT6(0xd207f000 | bpf_arg_off, + 0xf000 | arg); + j += 1; + } else { + if (i < MAX_NR_REG_ARGS) { + /* mvc bpf_arg_off(16,%r15),0(%arg) */ + _EMIT6(0xd20ff000 | bpf_arg_off, + reg2hex[arg] << 12); + } else { + /* lg %r1,arg(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0, + REG_15, arg); + /* mvc bpf_arg_off(16,%r15),0(%r1) */ + _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000); + } + j += 2; + } + } + /* stmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* stg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off); + + if (flags & BPF_TRAMP_F_ORIG_STACK) { + /* + * The ftrace trampoline puts the return address (which is the + * address of the original function + S390X_PATCH_SIZE) into + * %r0; see ftrace_shared_hotpatch_trampoline_br and + * ftrace_init_nop() for details. + */ + + /* lgr %r8,%r0 */ + EMIT4(0xb9040000, REG_8, REG_0); + } else { + /* %r8 = func_addr + S390X_PATCH_SIZE */ + load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); + } + + /* + * ip = func_addr; + * arg_cnt = m->nr_args; + */ + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* %r0 = func_addr */ + load_imm64(jit, REG_0, (u64)func_addr); + /* stg %r0,ip_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->ip_off); + } + /* lghi %r0,nr_bpf_args */ + EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); + /* stg %r0,arg_cnt_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->arg_cnt_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * __bpf_tramp_enter(im); + */ + + /* %r1 = __bpf_tramp_enter */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fentry->links[i], + flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + /* + * retval = 0; + */ + + /* xc retval_off(8,%r15),retval_off(%r15) */ + _EMIT6(0xd707f000 | tjit->retval_off, + 0xf000 | tjit->retval_off); + + for (i = 0; i < fmod_ret->nr_links; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + return -EINVAL; + + /* + * if (retval) + * goto do_fexit; + */ + + /* ltg %r0,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15, + tjit->retval_off); + /* brcl 7,do_fexit */ + EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit); + } + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * retval = func_addr(args); + */ + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */ + if (nr_stack_args) + _EMIT6(0xd200f000 | + (nr_stack_args * sizeof(u64) - 1) << 16 | + tjit->stack_args_off, + 0xf000 | tjit->orig_stack_args_off); + /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); + /* lgr %r1,%r8 */ + EMIT4(0xb9040000, REG_1, REG_8); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + + im->ip_after_call = jit->prg_buf + jit->prg; + + /* + * The following nop will be patched by bpf_tramp_image_put(). + */ + + /* brcl 0,im->ip_epilogue */ + EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); + } + + /* do_fexit: */ + tjit->do_fexit = jit->prg; + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = jit->prg_buf + jit->prg; + + /* + * __bpf_tramp_exit(im); + */ + + /* %r1 = __bpf_tramp_exit */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* lgr %r1,%r8 */ + if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + EMIT4(0xb9040000, REG_1, REG_8); + /* lmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* lg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off); + /* lg %r2,retval_off(%r15) */ + if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, + tjit->retval_off); + /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + 0xf000 | tjit->tccnt_off); + /* aghi %r15,stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); + /* Emit an expoline for the following indirect jump. */ + if (nospec_uses_trampoline()) + emit_expoline(jit); + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* br %r14 */ + _EMIT2(0x07fe); + else + /* br %r1 */ + _EMIT2(0x07f1); + + emit_r1_thunk(jit); + + return 0; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_jit tjit; + int ret; + int i; + + for (i = 0; i < 2; i++) { + if (i == 0) { + /* Compute offsets, check whether the code fits. */ + memset(&tjit, 0, sizeof(tjit)); + } else { + /* Generate the code. */ + tjit.common.prg = 0; + tjit.common.prg_buf = image; + } + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + } + + return tjit.common.prg; +} + +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} diff --git a/arch/s390/net/pnet.c b/arch/s390/net/pnet.c new file mode 100644 index 0000000000..79211bec0f --- /dev/null +++ b/arch/s390/net/pnet.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IBM System z PNET ID Support + * + * Copyright IBM Corp. 2018 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PNETIDS_LEN 64 /* Total utility string length in bytes + * to cover up to 4 PNETIDs of 16 bytes + * for up to 4 device ports + */ +#define MAX_PNETID_LEN 16 /* Max.length of a single port PNETID */ +#define MAX_PNETID_PORTS (PNETIDS_LEN / MAX_PNETID_LEN) + /* Max. # of ports with a PNETID */ + +/* + * Get the PNETIDs from a device. + * s390 hardware supports the definition of a so-called Physical Network + * Identifier (short PNETID) per network device port. These PNETIDs can be + * used to identify network devices that are attached to the same physical + * network (broadcast domain). + * + * The device can be + * - a ccwgroup device with all bundled subchannels having the same PNETID + * - a PCI attached network device + * + * Returns: + * 0: PNETIDs extracted from device. + * -ENOMEM: No memory to extract utility string. + * -EOPNOTSUPP: Device type without utility string support + */ +static int pnet_ids_by_device(struct device *dev, u8 *pnetids) +{ + memset(pnetids, 0, PNETIDS_LEN); + if (dev_is_ccwgroup(dev)) { + struct ccwgroup_device *gdev = to_ccwgroupdev(dev); + u8 *util_str; + + util_str = ccw_device_get_util_str(gdev->cdev[0], 0); + if (!util_str) + return -ENOMEM; + memcpy(pnetids, util_str, PNETIDS_LEN); + EBCASC(pnetids, PNETIDS_LEN); + kfree(util_str); + return 0; + } + if (dev_is_pci(dev)) { + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + + memcpy(pnetids, zdev->util_str, sizeof(zdev->util_str)); + EBCASC(pnetids, sizeof(zdev->util_str)); + return 0; + } + return -EOPNOTSUPP; +} + +/* + * Extract the pnetid for a device port. + * + * Return 0 if a pnetid is found and -ENOENT otherwise. + */ +int pnet_id_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid) +{ + u8 pnetids[MAX_PNETID_PORTS][MAX_PNETID_LEN]; + static const u8 zero[MAX_PNETID_LEN] = { 0 }; + int rc = 0; + + if (!dev || port >= MAX_PNETID_PORTS) + return -ENOENT; + + if (!pnet_ids_by_device(dev, (u8 *)pnetids) && + memcmp(pnetids[port], zero, MAX_PNETID_LEN)) + memcpy(pnetid, pnetids[port], MAX_PNETID_LEN); + else + rc = -ENOENT; + + return rc; +} +EXPORT_SYMBOL_GPL(pnet_id_by_dev_port); + +MODULE_DESCRIPTION("pnetid determination from utility strings"); +MODULE_LICENSE("GPL"); diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile new file mode 100644 index 0000000000..5ae31ca9dd --- /dev/null +++ b/arch/s390/pci/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the s390 PCI subsystem. +# + +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ + pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ + pci_bus.o pci_kvm_hook.o +obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c new file mode 100644 index 0000000000..d34d5813d0 --- /dev/null +++ b/arch/s390/pci/pci.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * + * Author(s): + * Jan Glauber + * + * The System z PCI code is a rewrite from a prototype by + * the following people (Kudoz!): + * Alexander Schmidt + * Christoph Raisch + * Hannes Hering + * Hoang-Nam Nguyen + * Jan-Bernd Themann + * Stefan Roscher + * Thomas Klein + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "pci_bus.h" +#include "pci_iov.h" + +/* list of all detected zpci devices */ +static LIST_HEAD(zpci_list); +static DEFINE_SPINLOCK(zpci_list_lock); + +static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); +static DEFINE_SPINLOCK(zpci_domain_lock); + +#define ZPCI_IOMAP_ENTRIES \ + min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ + ZPCI_IOMAP_MAX_ENTRIES) + +unsigned int s390_pci_no_rid; + +static DEFINE_SPINLOCK(zpci_iomap_lock); +static unsigned long *zpci_iomap_bitmap; +struct zpci_iomap_entry *zpci_iomap_start; +EXPORT_SYMBOL_GPL(zpci_iomap_start); + +DEFINE_STATIC_KEY_FALSE(have_mio); + +static struct kmem_cache *zdev_fmb_cache; + +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + +struct zpci_dev *get_zdev_by_fid(u32 fid) +{ + struct zpci_dev *tmp, *zdev = NULL; + + spin_lock(&zpci_list_lock); + list_for_each_entry(tmp, &zpci_list, entry) { + if (tmp->fid == fid) { + zdev = tmp; + zpci_zdev_get(zdev); + break; + } + } + spin_unlock(&zpci_list_lock); + return zdev; +} + +void zpci_remove_reserved_devices(void) +{ + struct zpci_dev *tmp, *zdev; + enum zpci_state state; + LIST_HEAD(remove); + + spin_lock(&zpci_list_lock); + list_for_each_entry_safe(zdev, tmp, &zpci_list, entry) { + if (zdev->state == ZPCI_FN_STATE_STANDBY && + !clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) + list_move_tail(&zdev->entry, &remove); + } + spin_unlock(&zpci_list_lock); + + list_for_each_entry_safe(zdev, tmp, &remove, entry) + zpci_device_reserved(zdev); +} + +int pci_domain_nr(struct pci_bus *bus) +{ + return ((struct zpci_bus *) bus->sysdata)->domain_nr; +} +EXPORT_SYMBOL_GPL(pci_domain_nr); + +int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} +EXPORT_SYMBOL_GPL(pci_proc_domain); + +/* Modify PCI: Register I/O address translation parameters */ +int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, + u64 base, u64 limit, u64 iota, u8 *status) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); + struct zpci_fib fib = {0}; + u8 cc; + + WARN_ON_ONCE(iota & 0x3fff); + fib.pba = base; + fib.pal = limit; + fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, status); + if (cc) + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); + return cc; +} +EXPORT_SYMBOL_GPL(zpci_register_ioat); + +/* Modify PCI: Unregister I/O address translation parameters */ +int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_DEREG_IOAT); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) + zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + return cc; +} + +/* Modify PCI: Set PCI function measurement parameters */ +int zpci_fmb_enable_device(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; + + if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) + return -EINVAL; + + zdev->fmb = kmem_cache_zalloc(zdev_fmb_cache, GFP_KERNEL); + if (!zdev->fmb) + return -ENOMEM; + WARN_ON((u64) zdev->fmb & 0xf); + + /* reset software counters */ + atomic64_set(&zdev->allocated_pages, 0); + atomic64_set(&zdev->mapped_pages, 0); + atomic64_set(&zdev->unmapped_pages, 0); + + fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + kmem_cache_free(zdev_fmb_cache, zdev->fmb); + zdev->fmb = NULL; + } + return cc ? -EIO : 0; +} + +/* Modify PCI: Disable PCI function measurement */ +int zpci_fmb_disable_device(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; + + if (!zdev->fmb) + return -EINVAL; + + fib.gd = zdev->gisa; + + /* Function measurement is disabled if fmb address is zero */ + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3) /* Function already gone. */ + cc = 0; + + if (!cc) { + kmem_cache_free(zdev_fmb_cache, zdev->fmb); + zdev->fmb = NULL; + } + return cc ? -EIO : 0; +} + +static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len); + u64 data; + int rc; + + rc = __zpci_load(&data, req, offset); + if (!rc) { + data = le64_to_cpu((__force __le64) data); + data >>= (8 - len) * 8; + *val = (u32) data; + } else + *val = 0xffffffff; + return rc; +} + +static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len); + u64 data = val; + int rc; + + data <<= (8 - len) * 8; + data = (__force u64) cpu_to_le64(data); + rc = __zpci_store(data, req, offset); + return rc; +} + +resource_size_t pcibios_align_resource(void *data, const struct resource *res, + resource_size_t size, + resource_size_t align) +{ + return 0; +} + +/* combine single writes by using store-block insn */ +void __iowrite64_copy(void __iomem *to, const void *from, size_t count) +{ + zpci_memcpy_toio(to, from, count); +} + +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) +{ + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. + */ + if (!static_branch_unlikely(&have_mio)) + return (void __iomem *)phys_addr; + + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); +} +EXPORT_SYMBOL(ioremap_prot); + +void iounmap(volatile void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + generic_iounmap(addr); +} +EXPORT_SYMBOL(iounmap); + +/* Create a virtual mapping cookie for a PCI BAR */ +static void __iomem *pci_iomap_range_fh(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + struct zpci_dev *zdev = to_zpci(pdev); + int idx; + + idx = zdev->bars[bar].map_idx; + spin_lock(&zpci_iomap_lock); + /* Detect overrun */ + WARN_ON(!++zpci_iomap_start[idx].count); + zpci_iomap_start[idx].fh = zdev->fh; + zpci_iomap_start[idx].bar = bar; + spin_unlock(&zpci_iomap_lock); + + return (void __iomem *) ZPCI_ADDR(idx) + offset; +} + +static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, + unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wt, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} +EXPORT_SYMBOL(pci_iomap_range); + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + return pci_iomap_range(dev, bar, 0, maxlen); +} +EXPORT_SYMBOL(pci_iomap); + +static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wb, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (bar >= PCI_STD_NUM_BARS || !pci_resource_len(pdev, bar)) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_wc_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} +EXPORT_SYMBOL(pci_iomap_wc_range); + +void __iomem *pci_iomap_wc(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + return pci_iomap_wc_range(dev, bar, 0, maxlen); +} +EXPORT_SYMBOL(pci_iomap_wc); + +static void pci_iounmap_fh(struct pci_dev *pdev, void __iomem *addr) +{ + unsigned int idx = ZPCI_IDX(addr); + + spin_lock(&zpci_iomap_lock); + /* Detect underrun */ + WARN_ON(!zpci_iomap_start[idx].count); + if (!--zpci_iomap_start[idx].count) { + zpci_iomap_start[idx].fh = 0; + zpci_iomap_start[idx].bar = 0; + } + spin_unlock(&zpci_iomap_lock); +} + +static void pci_iounmap_mio(struct pci_dev *pdev, void __iomem *addr) +{ + iounmap(addr); +} + +void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + pci_iounmap_mio(pdev, addr); + else + pci_iounmap_fh(pdev, addr); +} +EXPORT_SYMBOL(pci_iounmap); + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 *val) +{ + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); + + return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV; +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 val) +{ + struct zpci_dev *zdev = zdev_from_bus(bus, devfn); + + return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV; +} + +static struct pci_ops pci_root_ops = { + .read = pci_read, + .write = pci_write, +}; + +static void zpci_map_resources(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + resource_size_t len; + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + len = pci_resource_len(pdev, i); + if (!len) + continue; + + if (zpci_use_mio(zdev)) + pdev->resource[i].start = + (resource_size_t __force) zdev->bars[i].mio_wt; + else + pdev->resource[i].start = (resource_size_t __force) + pci_iomap_range_fh(pdev, i, 0, 0); + pdev->resource[i].end = pdev->resource[i].start + len - 1; + } + + zpci_iov_map_resources(pdev); +} + +static void zpci_unmap_resources(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + resource_size_t len; + int i; + + if (zpci_use_mio(zdev)) + return; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + len = pci_resource_len(pdev, i); + if (!len) + continue; + pci_iounmap_fh(pdev, (void __iomem __force *) + pdev->resource[i].start); + } +} + +static int zpci_alloc_iomap(struct zpci_dev *zdev) +{ + unsigned long entry; + + spin_lock(&zpci_iomap_lock); + entry = find_first_zero_bit(zpci_iomap_bitmap, ZPCI_IOMAP_ENTRIES); + if (entry == ZPCI_IOMAP_ENTRIES) { + spin_unlock(&zpci_iomap_lock); + return -ENOSPC; + } + set_bit(entry, zpci_iomap_bitmap); + spin_unlock(&zpci_iomap_lock); + return entry; +} + +static void zpci_free_iomap(struct zpci_dev *zdev, int entry) +{ + spin_lock(&zpci_iomap_lock); + memset(&zpci_iomap_start[entry], 0, sizeof(struct zpci_iomap_entry)); + clear_bit(entry, zpci_iomap_bitmap); + spin_unlock(&zpci_iomap_lock); +} + +static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh) +{ + int bar, idx; + + spin_lock(&zpci_iomap_lock); + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + if (!zdev->bars[bar].size) + continue; + idx = zdev->bars[bar].map_idx; + if (!zpci_iomap_start[idx].count) + continue; + WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh); + } + spin_unlock(&zpci_iomap_lock); +} + +void zpci_update_fh(struct zpci_dev *zdev, u32 fh) +{ + if (!fh || zdev->fh == fh) + return; + + zdev->fh = fh; + if (zpci_use_mio(zdev)) + return; + if (zdev->has_resources && zdev_enabled(zdev)) + zpci_do_update_iomap_fh(zdev, fh); +} + +static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, + unsigned long size, unsigned long flags) +{ + struct resource *r; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return NULL; + + r->start = start; + r->end = r->start + size - 1; + r->flags = flags; + r->name = zdev->res_name; + + if (request_resource(&iomem_resource, r)) { + kfree(r); + return NULL; + } + return r; +} + +int zpci_setup_bus_resources(struct zpci_dev *zdev) +{ + unsigned long addr, size, flags; + struct resource *res; + int i, entry; + + snprintf(zdev->res_name, sizeof(zdev->res_name), + "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR); + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!zdev->bars[i].size) + continue; + entry = zpci_alloc_iomap(zdev); + if (entry < 0) + return entry; + zdev->bars[i].map_idx = entry; + + /* only MMIO is supported */ + flags = IORESOURCE_MEM; + if (zdev->bars[i].val & 8) + flags |= IORESOURCE_PREFETCH; + if (zdev->bars[i].val & 4) + flags |= IORESOURCE_MEM_64; + + if (zpci_use_mio(zdev)) + addr = (unsigned long) zdev->bars[i].mio_wt; + else + addr = ZPCI_ADDR(entry); + size = 1UL << zdev->bars[i].size; + + res = __alloc_res(zdev, addr, size, flags); + if (!res) { + zpci_free_iomap(zdev, entry); + return -ENOMEM; + } + zdev->bars[i].res = res; + } + zdev->has_resources = 1; + + return 0; +} + +static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) +{ + struct resource *res; + int i; + + pci_lock_rescan_remove(); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + res = zdev->bars[i].res; + if (!res) + continue; + + release_resource(res); + pci_bus_remove_resource(zdev->zbus->bus, res); + zpci_free_iomap(zdev, zdev->bars[i].map_idx); + zdev->bars[i].res = NULL; + kfree(res); + } + zdev->has_resources = 0; + pci_unlock_rescan_remove(); +} + +int pcibios_device_add(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct resource *res; + int i; + + /* The pdev has a reference to the zdev via its bus */ + zpci_zdev_get(zdev); + if (pdev->is_physfn) + pdev->no_vf_scan = 1; + + pdev->dev.groups = zpci_attr_groups; + pdev->dev.dma_ops = &s390_pci_dma_ops; + zpci_map_resources(pdev); + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + res = &pdev->resource[i]; + if (res->parent || !res->flags) + continue; + pci_claim_resource(pdev, i); + } + + return 0; +} + +void pcibios_release_device(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + zpci_unmap_resources(pdev); + zpci_zdev_put(zdev); +} + +int pcibios_enable_device(struct pci_dev *pdev, int mask) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + zpci_debug_init_device(zdev, dev_name(&pdev->dev)); + zpci_fmb_enable_device(zdev); + + return pci_enable_resources(pdev, mask); +} + +void pcibios_disable_device(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + zpci_fmb_disable_device(zdev); + zpci_debug_exit_device(zdev); +} + +static int __zpci_register_domain(int domain) +{ + spin_lock(&zpci_domain_lock); + if (test_bit(domain, zpci_domain)) { + spin_unlock(&zpci_domain_lock); + pr_err("Domain %04x is already assigned\n", domain); + return -EEXIST; + } + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; +} + +static int __zpci_alloc_domain(void) +{ + int domain; + + spin_lock(&zpci_domain_lock); + /* + * We can always auto allocate domains below ZPCI_NR_DEVICES. + * There is either a free domain or we have reached the maximum in + * which case we would have bailed earlier. + */ + domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; +} + +int zpci_alloc_domain(int domain) +{ + if (zpci_unique_uid) { + if (domain) + return __zpci_register_domain(domain); + pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n"); + update_uid_checking(false); + } + return __zpci_alloc_domain(); +} + +void zpci_free_domain(int domain) +{ + spin_lock(&zpci_domain_lock); + clear_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); +} + + +int zpci_enable_device(struct zpci_dev *zdev) +{ + u32 fh = zdev->fh; + int rc = 0; + + if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES)) + rc = -EIO; + else + zpci_update_fh(zdev, fh); + return rc; +} +EXPORT_SYMBOL_GPL(zpci_enable_device); + +int zpci_disable_device(struct zpci_dev *zdev) +{ + u32 fh = zdev->fh; + int cc, rc = 0; + + cc = clp_disable_fh(zdev, &fh); + if (!cc) { + zpci_update_fh(zdev, fh); + } else if (cc == CLP_RC_SETPCIFN_ALRDY) { + pr_info("Disabling PCI function %08x had no effect as it was already disabled\n", + zdev->fid); + /* Function is already disabled - update handle */ + rc = clp_refresh_fh(zdev->fid, &fh); + if (!rc) { + zpci_update_fh(zdev, fh); + rc = -EINVAL; + } + } else { + rc = -EIO; + } + return rc; +} +EXPORT_SYMBOL_GPL(zpci_disable_device); + +/** + * zpci_hot_reset_device - perform a reset of the given zPCI function + * @zdev: the slot which should be reset + * + * Performs a low level reset of the zPCI function. The reset is low level in + * the sense that the zPCI function can be reset without detaching it from the + * common PCI subsystem. The reset may be performed while under control of + * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation + * table is reinstated at the end of the reset. + * + * After the reset the functions internal state is reset to an initial state + * equivalent to its state during boot when first probing a driver. + * Consequently after reset the PCI function requires re-initialization via the + * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors() + * and enabling the function via e.g.pci_enablde_device_flags().The caller + * must guard against concurrent reset attempts. + * + * In most cases this function should not be called directly but through + * pci_reset_function() or pci_reset_bus() which handle the save/restore and + * locking. + * + * Return: 0 on success and an error value otherwise + */ +int zpci_hot_reset_device(struct zpci_dev *zdev) +{ + u8 status; + int rc; + + zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); + if (zdev_enabled(zdev)) { + /* Disables device access, DMAs and IRQs (reset state) */ + rc = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error state the + * FH may indicate an enabled device but disable says the + * device is already disabled don't treat it as an error here. + */ + if (rc == -EINVAL) + rc = 0; + if (rc) + return rc; + } + + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + if (zdev->dma_table) + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + else + rc = zpci_dma_init_device(zdev); + if (rc) { + zpci_disable_device(zdev); + return rc; + } + + return 0; +} + +/** + * zpci_create_device() - Create a new zpci_dev and add it to the zbus + * @fid: Function ID of the device to be created + * @fh: Current Function Handle of the device to be created + * @state: Initial state after creation either Standby or Configured + * + * Creates a new zpci device and adds it to its, possibly newly created, zbus + * as well as zpci_list. + * + * Returns: the zdev on success or an error pointer otherwise + */ +struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) +{ + struct zpci_dev *zdev; + int rc; + + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); + zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); + if (!zdev) + return ERR_PTR(-ENOMEM); + + /* FID and Function Handle are the static/dynamic identifiers */ + zdev->fid = fid; + zdev->fh = fh; + + /* Query function properties and update zdev */ + rc = clp_query_pci_fn(zdev); + if (rc) + goto error; + zdev->state = state; + + kref_init(&zdev->kref); + mutex_init(&zdev->lock); + mutex_init(&zdev->kzdev_lock); + + rc = zpci_init_iommu(zdev); + if (rc) + goto error; + + rc = zpci_bus_device_register(zdev, &pci_root_ops); + if (rc) + goto error_destroy_iommu; + + spin_lock(&zpci_list_lock); + list_add_tail(&zdev->entry, &zpci_list); + spin_unlock(&zpci_list_lock); + + return zdev; + +error_destroy_iommu: + zpci_destroy_iommu(zdev); +error: + zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); + kfree(zdev); + return ERR_PTR(rc); +} + +bool zpci_is_device_configured(struct zpci_dev *zdev) +{ + enum zpci_state state = zdev->state; + + return state != ZPCI_FN_STATE_RESERVED && + state != ZPCI_FN_STATE_STANDBY; +} + +/** + * zpci_scan_configured_device() - Scan a freshly configured zpci_dev + * @zdev: The zpci_dev to be configured + * @fh: The general function handle supplied by the platform + * + * Given a device in the configuration state Configured, enables, scans and + * adds it to the common code PCI subsystem if possible. If any failure occurs, + * the zpci_dev is left disabled. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) +{ + zpci_update_fh(zdev, fh); + return zpci_bus_scan_device(zdev); +} + +/** + * zpci_deconfigure_device() - Deconfigure a zpci_dev + * @zdev: The zpci_dev to configure + * + * Deconfigure a zPCI function that is currently configured and possibly known + * to the common code PCI subsystem. + * If any failure occurs the device is left as is. + * + * Return: 0 on success, or an error code otherwise + */ +int zpci_deconfigure_device(struct zpci_dev *zdev) +{ + int rc; + + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); + + if (zdev->dma_table) { + rc = zpci_dma_exit_device(zdev); + if (rc) + return rc; + } + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + return rc; + } + + rc = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, rc); + if (rc) + return rc; + zdev->state = ZPCI_FN_STATE_STANDBY; + + return 0; +} + +/** + * zpci_device_reserved() - Mark device as resverved + * @zdev: the zpci_dev that was reserved + * + * Handle the case that a given zPCI function was reserved by another system. + * After a call to this function the zpci_dev can not be found via + * get_zdev_by_fid() anymore but may still be accessible via existing + * references though it will not be functional anymore. + */ +void zpci_device_reserved(struct zpci_dev *zdev) +{ + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + /* + * Remove device from zpci_list as it is going away. This also + * makes sure we ignore subsequent zPCI events for this device. + */ + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zdev->state = ZPCI_FN_STATE_RESERVED; + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + zpci_zdev_put(zdev); +} + +void zpci_release_device(struct kref *kref) +{ + struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); + int ret; + + if (zdev->zbus->bus) + zpci_bus_remove_device(zdev, false); + + if (zdev->dma_table) + zpci_dma_exit_device(zdev); + if (zdev_enabled(zdev)) + zpci_disable_device(zdev); + + switch (zdev->state) { + case ZPCI_FN_STATE_CONFIGURED: + ret = sclp_pci_deconfigure(zdev->fid); + zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); + fallthrough; + case ZPCI_FN_STATE_STANDBY: + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rsv fid:%x\n", zdev->fid); + fallthrough; + case ZPCI_FN_STATE_RESERVED: + if (zdev->has_resources) + zpci_cleanup_bus_resources(zdev); + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); + fallthrough; + default: + break; + } + zpci_dbg(3, "rem fid:%x\n", zdev->fid); + kfree_rcu(zdev, rcu); +} + +int zpci_report_error(struct pci_dev *pdev, + struct zpci_report_error_header *report) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + return sclp_pci_report(report, zdev->fh, zdev->fid); +} +EXPORT_SYMBOL(zpci_report_error); + +/** + * zpci_clear_error_state() - Clears the zPCI error state of the device + * @zdev: The zdev for which the zPCI error state should be reset + * + * Clear the zPCI error state of the device. If clearing the zPCI error state + * fails the device is left in the error state. In this case it may make sense + * to call zpci_io_perm_failure() on the associated pdev if it exists. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_clear_error_state(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + +/** + * zpci_reset_load_store_blocked() - Re-enables L/S from error state + * @zdev: The zdev for which to unblock load/store access + * + * Re-enables load/store access for a PCI function in the error state while + * keeping DMA blocked. In this state drivers can poke MMIO space to determine + * if error recovery is possible while catching any rogue DMA access from the + * device. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_reset_load_store_blocked(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + +static int zpci_mem_init(void) +{ + BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || + __alignof__(struct zpci_fmb) < sizeof(struct zpci_fmb)); + + zdev_fmb_cache = kmem_cache_create("PCI_FMB_cache", sizeof(struct zpci_fmb), + __alignof__(struct zpci_fmb), 0, NULL); + if (!zdev_fmb_cache) + goto error_fmb; + + zpci_iomap_start = kcalloc(ZPCI_IOMAP_ENTRIES, + sizeof(*zpci_iomap_start), GFP_KERNEL); + if (!zpci_iomap_start) + goto error_iomap; + + zpci_iomap_bitmap = kcalloc(BITS_TO_LONGS(ZPCI_IOMAP_ENTRIES), + sizeof(*zpci_iomap_bitmap), GFP_KERNEL); + if (!zpci_iomap_bitmap) + goto error_iomap_bitmap; + + if (static_branch_likely(&have_mio)) + clp_setup_writeback_mio(); + + return 0; +error_iomap_bitmap: + kfree(zpci_iomap_start); +error_iomap: + kmem_cache_destroy(zdev_fmb_cache); +error_fmb: + return -ENOMEM; +} + +static void zpci_mem_exit(void) +{ + kfree(zpci_iomap_bitmap); + kfree(zpci_iomap_start); + kmem_cache_destroy(zdev_fmb_cache); +} + +static unsigned int s390_pci_probe __initdata = 1; +unsigned int s390_pci_force_floating __initdata; +static unsigned int s390_pci_initialized; + +char * __init pcibios_setup(char *str) +{ + if (!strcmp(str, "off")) { + s390_pci_probe = 0; + return NULL; + } + if (!strcmp(str, "nomio")) { + S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO; + return NULL; + } + if (!strcmp(str, "force_floating")) { + s390_pci_force_floating = 1; + return NULL; + } + if (!strcmp(str, "norid")) { + s390_pci_no_rid = 1; + return NULL; + } + return str; +} + +bool zpci_is_enabled(void) +{ + return s390_pci_initialized; +} + +static int __init pci_base_init(void) +{ + int rc; + + if (!s390_pci_probe) + return 0; + + if (!test_facility(69) || !test_facility(71)) { + pr_info("PCI is not supported because CPU facilities 69 or 71 are not available\n"); + return 0; + } + + if (MACHINE_HAS_PCI_MIO) { + static_branch_enable(&have_mio); + ctl_set_bit(2, 5); + } + + rc = zpci_debug_init(); + if (rc) + goto out; + + rc = zpci_mem_init(); + if (rc) + goto out_mem; + + rc = zpci_irq_init(); + if (rc) + goto out_irq; + + rc = zpci_dma_init(); + if (rc) + goto out_dma; + + rc = clp_scan_pci_devices(); + if (rc) + goto out_find; + zpci_bus_scan_busses(); + + s390_pci_initialized = 1; + return 0; + +out_find: + zpci_dma_exit(); +out_dma: + zpci_irq_exit(); +out_irq: + zpci_mem_exit(); +out_mem: + zpci_debug_exit(); +out: + return rc; +} +subsys_initcall_sync(pci_base_init); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c new file mode 100644 index 0000000000..32245b970a --- /dev/null +++ b/arch/s390/pci/pci_bus.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pci_bus.h" +#include "pci_iov.h" + +static LIST_HEAD(zbus_list); +static DEFINE_MUTEX(zbus_list_lock); +static int zpci_nb_devices; + +/* zpci_bus_prepare_device - Prepare a zPCI function for scanning + * @zdev: the zPCI function to be prepared + * + * The PCI resources for the function are set up and added to its zbus and the + * function is enabled. The function must be added to a zbus which must have + * a PCI bus created. If an error occurs the zPCI function is not enabled. + * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_prepare_device(struct zpci_dev *zdev) +{ + int rc, i; + + if (!zdev_enabled(zdev)) { + rc = zpci_enable_device(zdev); + if (rc) + return rc; + rc = zpci_dma_init_device(zdev); + if (rc) { + zpci_disable_device(zdev); + return rc; + } + } + + if (!zdev->has_resources) { + zpci_setup_bus_resources(zdev); + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (zdev->bars[i].res) + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0); + } + } + + return 0; +} + +/* zpci_bus_scan_device - Scan a single device adding it to the PCI core + * @zdev: the zdev to be scanned + * + * Scans the PCI function making it available to the common PCI code. + * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_device(struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + int rc; + + rc = zpci_bus_prepare_device(zdev); + if (rc) + return rc; + + pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn); + if (!pdev) + return -ENODEV; + + pci_lock_rescan_remove(); + pci_bus_add_device(pdev); + pci_unlock_rescan_remove(); + + return 0; +} + +/* zpci_bus_remove_device - Removes the given zdev from the PCI core + * @zdev: the zdev to be removed from the PCI core + * @set_error: if true the device's error state is set to permanent failure + * + * Sets a zPCI device to a configured but offline state; the zPCI + * device is still accessible through its hotplug slot and the zPCI + * API but is removed from the common code PCI bus, making it + * no longer available to drivers. + */ +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error) +{ + struct zpci_bus *zbus = zdev->zbus; + struct pci_dev *pdev; + + if (!zdev->zbus->bus) + return; + + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (pdev) { + if (set_error) + pdev->error_state = pci_channel_io_perm_failure; + if (pdev->is_virtfn) { + zpci_iov_remove_virtfn(pdev, zdev->vfn); + /* balance pci_get_slot */ + pci_dev_put(pdev); + return; + } + pci_stop_and_remove_bus_device_locked(pdev); + /* balance pci_get_slot */ + pci_dev_put(pdev); + } +} + +/* zpci_bus_scan_bus - Scan all configured zPCI functions on the bus + * @zbus: the zbus to be scanned + * + * Enables and scans all PCI functions on the bus making them available to the + * common PCI code. If a PCI function fails to be initialized an error will be + * returned but attempts will still be made for all other functions on the bus. + * + * Return: 0 on success, an error value otherwise + */ +int zpci_bus_scan_bus(struct zpci_bus *zbus) +{ + struct zpci_dev *zdev; + int devfn, rc, ret = 0; + + for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) { + zdev = zbus->function[devfn]; + if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) { + rc = zpci_bus_prepare_device(zdev); + if (rc) + ret = -EIO; + } + } + + pci_lock_rescan_remove(); + pci_scan_child_bus(zbus->bus); + pci_bus_add_devices(zbus->bus); + pci_unlock_rescan_remove(); + + return ret; +} + +/* zpci_bus_scan_busses - Scan all registered busses + * + * Scan all available zbusses + * + */ +void zpci_bus_scan_busses(void) +{ + struct zpci_bus *zbus = NULL; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + zpci_bus_scan_bus(zbus); + cond_resched(); + } + mutex_unlock(&zbus_list_lock); +} + +/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus + * @zbus: the zbus holding the zdevices + * @fr: PCI root function that will determine the bus's domain, and bus speeed + * @ops: the pci operations + * + * The PCI function @fr determines the domain (its UID), multifunction property + * and maximum bus speed of the entire bus. + * + * Return: 0 on success, an error code otherwise + */ +static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops) +{ + struct pci_bus *bus; + int domain; + + domain = zpci_alloc_domain((u16)fr->uid); + if (domain < 0) + return domain; + + zbus->domain_nr = domain; + zbus->multifunction = fr->rid_available; + zbus->max_bus_speed = fr->max_bus_speed; + + /* + * Note that the zbus->resources are taken over and zbus->resources + * is empty after a successful call + */ + bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); + if (!bus) { + zpci_free_domain(zbus->domain_nr); + return -EFAULT; + } + + zbus->bus = bus; + + return 0; +} + +static void zpci_bus_release(struct kref *kref) +{ + struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + + if (zbus->bus) { + pci_lock_rescan_remove(); + pci_stop_root_bus(zbus->bus); + + zpci_free_domain(zbus->domain_nr); + pci_free_resource_list(&zbus->resources); + + pci_remove_root_bus(zbus->bus); + pci_unlock_rescan_remove(); + } + + mutex_lock(&zbus_list_lock); + list_del(&zbus->bus_next); + mutex_unlock(&zbus_list_lock); + kfree(zbus); +} + +static void zpci_bus_put(struct zpci_bus *zbus) +{ + kref_put(&zbus->kref, zpci_bus_release); +} + +static struct zpci_bus *zpci_bus_get(int pchid) +{ + struct zpci_bus *zbus; + + mutex_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + if (pchid == zbus->pchid) { + kref_get(&zbus->kref); + goto out_unlock; + } + } + zbus = NULL; +out_unlock: + mutex_unlock(&zbus_list_lock); + return zbus; +} + +static struct zpci_bus *zpci_bus_alloc(int pchid) +{ + struct zpci_bus *zbus; + + zbus = kzalloc(sizeof(*zbus), GFP_KERNEL); + if (!zbus) + return NULL; + + zbus->pchid = pchid; + INIT_LIST_HEAD(&zbus->bus_next); + mutex_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + mutex_unlock(&zbus_list_lock); + + kref_init(&zbus->kref); + INIT_LIST_HEAD(&zbus->resources); + + zbus->bus_resource.start = 0; + zbus->bus_resource.end = ZPCI_BUS_NR; + zbus->bus_resource.flags = IORESOURCE_BUS; + pci_add_resource(&zbus->resources, &zbus->bus_resource); + + return zbus; +} + +void pcibios_bus_add_device(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + /* + * With pdev->no_vf_scan the common PCI probing code does not + * perform PF/VF linking. + */ + if (zdev->vfn) { + zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn); + pdev->no_command_memory = 1; + } +} + +static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + int rc = -EINVAL; + + if (zbus->function[zdev->devfn]) { + pr_err("devfn %04x is already assigned\n", zdev->devfn); + return rc; + } + + zdev->zbus = zbus; + zbus->function[zdev->devfn] = zdev; + zpci_nb_devices++; + + if (zbus->multifunction && !zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + goto error; + } + rc = zpci_init_slot(zdev); + if (rc) + goto error; + zdev->has_hp_slot = 1; + + return 0; + +error: + zbus->function[zdev->devfn] = NULL; + zdev->zbus = NULL; + zpci_nb_devices--; + return rc; +} + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) +{ + struct zpci_bus *zbus = NULL; + int rc = -EBADF; + + if (zpci_nb_devices == ZPCI_NR_DEVICES) { + pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + + if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) + return -EINVAL; + + if (!s390_pci_no_rid && zdev->rid_available) + zbus = zpci_bus_get(zdev->pchid); + + if (!zbus) { + zbus = zpci_bus_alloc(zdev->pchid); + if (!zbus) + return -ENOMEM; + } + + if (!zbus->bus) { + /* The UID of the first PCI function registered with a zpci_bus + * is used as the domain number for that bus. Currently there + * is exactly one zpci_bus per domain. + */ + rc = zpci_bus_create_pci_bus(zbus, zdev, ops); + if (rc) + goto error; + } + + rc = zpci_bus_add_device(zbus, zdev); + if (rc) + goto error; + + return 0; + +error: + pr_err("Adding PCI function %08x failed\n", zdev->fid); + zpci_bus_put(zbus); + return rc; +} + +void zpci_bus_device_unregister(struct zpci_dev *zdev) +{ + struct zpci_bus *zbus = zdev->zbus; + + zpci_nb_devices--; + zbus->function[zdev->devfn] = NULL; + zpci_bus_put(zbus); +} diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h new file mode 100644 index 0000000000..af9f0ac79a --- /dev/null +++ b/arch/s390/pci/pci_bus.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel + * + */ + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); +void zpci_bus_device_unregister(struct zpci_dev *zdev); + +int zpci_bus_scan_bus(struct zpci_bus *zbus); +void zpci_bus_scan_busses(void); + +int zpci_bus_scan_device(struct zpci_dev *zdev); +void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error); + +void zpci_release_device(struct kref *kref); +static inline void zpci_zdev_put(struct zpci_dev *zdev) +{ + if (zdev) + kref_put(&zdev->kref, zpci_release_device); +} + +static inline void zpci_zdev_get(struct zpci_dev *zdev) +{ + kref_get(&zdev->kref); +} + +int zpci_alloc_domain(int domain); +void zpci_free_domain(int domain); +int zpci_setup_bus_resources(struct zpci_dev *zdev); + +static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus, + unsigned int devfn) +{ + struct zpci_bus *zbus = bus->sysdata; + + return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn]; +} + diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c new file mode 100644 index 0000000000..ee90a91ed8 --- /dev/null +++ b/arch/s390/pci/pci_clp.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * + * Author(s): + * Jan Glauber + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci_bus.h" + +bool zpci_unique_uid; + +void update_uid_checking(bool new) +{ + if (zpci_unique_uid != new) + zpci_dbg(3, "uid checking:%d\n", new); + + zpci_unique_uid = new; +} + +static inline void zpci_err_clp(unsigned int rsp, int rc) +{ + struct { + unsigned int rsp; + int rc; + } __packed data = {rsp, rc}; + + zpci_err_hex(&data, sizeof(data)); +} + +/* + * Call Logical Processor with c=1, lps=0 and command 1 + * to get the bit mask of installed logical processors + */ +static inline int clp_get_ilp(unsigned long *ilp) +{ + unsigned long mask; + int cc = 3; + + asm volatile ( + " .insn rrf,0xb9a00000,%[mask],%[cmd],8,0\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [mask] "=d" (mask) : [cmd] "a" (1) + : "cc"); + *ilp = mask; + return cc; +} + +/* + * Call Logical Processor with c=0, the give constant lps and an lpcb request. + */ +static __always_inline int clp_req(void *data, unsigned int lps) +{ + struct { u8 _[CLP_BLK_SIZE]; } *req = data; + u64 ignored; + int cc = 3; + + asm volatile ( + " .insn rrf,0xb9a00000,%[ign],%[req],0,%[lps]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [ign] "=d" (ignored), "+m" (*req) + : [req] "a" (req), [lps] "i" (lps) + : "cc"); + return cc; +} + +static void *clp_alloc_block(gfp_t gfp_mask) +{ + return (void *) __get_free_pages(gfp_mask, get_order(CLP_BLK_SIZE)); +} + +static void clp_free_block(void *ptr) +{ + free_pages((unsigned long) ptr, get_order(CLP_BLK_SIZE)); +} + +static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, + struct clp_rsp_query_pci_grp *response) +{ + zdev->tlb_refresh = response->refresh; + zdev->dma_mask = response->dasm; + zdev->msi_addr = response->msia; + zdev->max_msi = response->noi; + zdev->fmb_update = response->mui; + zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; + + switch (response->version) { + case 1: + zdev->max_bus_speed = PCIE_SPEED_5_0GT; + break; + default: + zdev->max_bus_speed = PCI_SPEED_UNKNOWN; + break; + } +} + +static int clp_query_pci_fngrp(struct zpci_dev *zdev, u8 pfgid) +{ + struct clp_req_rsp_query_pci_grp *rrb; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_QUERY_PCI_FNGRP; + rrb->response.hdr.len = sizeof(rrb->response); + rrb->request.pfgid = pfgid; + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) + clp_store_query_pci_fngrp(zdev, &rrb->response); + else { + zpci_err("Q PCI FGRP:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; + } + clp_free_block(rrb); + return rc; +} + +static int clp_store_query_pci_fn(struct zpci_dev *zdev, + struct clp_rsp_query_pci *response) +{ + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + zdev->bars[i].val = le32_to_cpu(response->bar[i]); + zdev->bars[i].size = response->bar_size[i]; + } + zdev->start_dma = response->sdma; + zdev->end_dma = response->edma; + zdev->pchid = response->pchid; + zdev->pfgid = response->pfgid; + zdev->pft = response->pft; + zdev->vfn = response->vfn; + zdev->port = response->port; + zdev->uid = response->uid; + zdev->fmb_length = sizeof(u32) * response->fmb_len; + zdev->rid_available = response->rid_avail; + zdev->is_physfn = response->is_physfn; + if (!s390_pci_no_rid && zdev->rid_available) + zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; + + memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); + if (response->util_str_avail) { + memcpy(zdev->util_str, response->util_str, + sizeof(zdev->util_str)); + zdev->util_str_avail = 1; + } + zdev->mio_capable = response->mio_addr_avail; + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(response->mio.valid & (1 << (PCI_STD_NUM_BARS - i - 1)))) + continue; + + zdev->bars[i].mio_wb = (void __iomem *) response->mio.addr[i].wb; + zdev->bars[i].mio_wt = (void __iomem *) response->mio.addr[i].wt; + } + return 0; +} + +int clp_query_pci_fn(struct zpci_dev *zdev) +{ + struct clp_req_rsp_query_pci *rrb; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_QUERY_PCI_FN; + rrb->response.hdr.len = sizeof(rrb->response); + rrb->request.fh = zdev->fh; + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + rc = clp_store_query_pci_fn(zdev, &rrb->response); + if (rc) + goto out; + rc = clp_query_pci_fngrp(zdev, rrb->response.pfgid); + } else { + zpci_err("Q PCI FN:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; + } +out: + clp_free_block(rrb); + return rc; +} + +/** + * clp_set_pci_fn() - Execute a command on a PCI function + * @zdev: Function that will be affected + * @fh: Out parameter for updated function handle + * @nr_dma_as: DMA address space number + * @command: The command code to execute + * + * Returns: 0 on success, < 0 for Linux errors (e.g. -ENOMEM), and + * > 0 for non-success platform responses + */ +static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 command) +{ + struct clp_req_rsp_set_pci *rrb; + int rc, retries = 100; + u32 gisa = 0; + + *fh = 0; + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + + do { + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_SET_PCI_FN; + rrb->response.hdr.len = sizeof(rrb->response); + rrb->request.fh = zdev->fh; + rrb->request.oc = command; + rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; + + rc = clp_req(rrb, CLP_LPS_PCI); + if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { + retries--; + if (retries < 0) + break; + msleep(20); + } + } while (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY); + + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + *fh = rrb->response.fh; + } else { + zpci_err("Set PCI FN:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + if (!rc) + rc = rrb->response.hdr.rsp; + } + clp_free_block(rrb); + return rc; +} + +int clp_setup_writeback_mio(void) +{ + struct clp_req_rsp_slpc_pci *rrb; + u8 wb_bit_pos; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_SLPC; + rrb->response.hdr.len = sizeof(rrb->response); + + rc = clp_req(rrb, CLP_LPS_PCI); + if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) { + if (rrb->response.vwb) { + wb_bit_pos = rrb->response.mio_wb; + set_bit_inv(wb_bit_pos, &mio_wb_bit_mask); + zpci_dbg(3, "wb bit: %d\n", wb_bit_pos); + } else { + zpci_dbg(3, "wb bit: n.a.\n"); + } + + } else { + zpci_err("SLPC PCI:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + rc = -EIO; + } + clp_free_block(rrb); + return rc; +} + +int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as) +{ + int rc; + + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + if (!rc && zpci_use_mio(zdev)) { + rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", + zdev->fid, *fh, rc); + if (rc) + clp_disable_fh(zdev, fh); + } + return rc; +} + +int clp_disable_fh(struct zpci_dev *zdev, u32 *fh) +{ + int rc; + + if (!zdev_enabled(zdev)) + return 0; + + rc = clp_set_pci_fn(zdev, fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc); + return rc; +} + +static int clp_list_pci_req(struct clp_req_rsp_list_pci *rrb, + u64 *resume_token, int *nentries) +{ + int rc; + + memset(rrb, 0, sizeof(*rrb)); + rrb->request.hdr.len = sizeof(rrb->request); + rrb->request.hdr.cmd = CLP_LIST_PCI; + /* store as many entries as possible */ + rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN; + rrb->request.resume_token = *resume_token; + + /* Get PCI function handle list */ + rc = clp_req(rrb, CLP_LPS_PCI); + if (rc || rrb->response.hdr.rsp != CLP_RC_OK) { + zpci_err("List PCI FN:\n"); + zpci_err_clp(rrb->response.hdr.rsp, rc); + return -EIO; + } + + update_uid_checking(rrb->response.uid_checking); + WARN_ON_ONCE(rrb->response.entry_size != + sizeof(struct clp_fh_list_entry)); + + *nentries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) / + rrb->response.entry_size; + *resume_token = rrb->response.resume_token; + + return rc; +} + +static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, + void (*cb)(struct clp_fh_list_entry *, void *)) +{ + u64 resume_token = 0; + int nentries, i, rc; + + do { + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + for (i = 0; i < nentries; i++) + cb(&rrb->response.fh_list[i], data); + } while (resume_token); + + return rc; +} + +static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid, + struct clp_fh_list_entry *entry) +{ + struct clp_fh_list_entry *fh_list; + u64 resume_token = 0; + int nentries, i, rc; + + do { + rc = clp_list_pci_req(rrb, &resume_token, &nentries); + if (rc) + return rc; + fh_list = rrb->response.fh_list; + for (i = 0; i < nentries; i++) { + if (fh_list[i].fid == fid) { + *entry = fh_list[i]; + return 0; + } + } + } while (resume_token); + + return -ENODEV; +} + +static void __clp_add(struct clp_fh_list_entry *entry, void *data) +{ + struct zpci_dev *zdev; + + if (!entry->vendor_id) + return; + + zdev = get_zdev_by_fid(entry->fid); + if (zdev) { + zpci_zdev_put(zdev); + return; + } + zpci_create_device(entry->fid, entry->fh, entry->config_state); +} + +int clp_scan_pci_devices(void) +{ + struct clp_req_rsp_list_pci *rrb; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + rc = clp_list_pci(rrb, NULL, __clp_add); + + clp_free_block(rrb); + return rc; +} + +/* + * Get the current function handle of the function matching @fid + */ +int clp_refresh_fh(u32 fid, u32 *fh) +{ + struct clp_req_rsp_list_pci *rrb; + struct clp_fh_list_entry entry; + int rc; + + rrb = clp_alloc_block(GFP_NOWAIT); + if (!rrb) + return -ENOMEM; + + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) + *fh = entry.fh; + + clp_free_block(rrb); + return rc; +} + +int clp_get_state(u32 fid, enum zpci_state *state) +{ + struct clp_req_rsp_list_pci *rrb; + struct clp_fh_list_entry entry; + int rc; + + rrb = clp_alloc_block(GFP_ATOMIC); + if (!rrb) + return -ENOMEM; + + rc = clp_find_pci(rrb, fid, &entry); + if (!rc) { + *state = entry.config_state; + } else if (rc == -ENODEV) { + *state = ZPCI_FN_STATE_RESERVED; + rc = 0; + } + + clp_free_block(rrb); + return rc; +} + +static int clp_base_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb) +{ + unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); + + if (lpcb->request.hdr.len != sizeof(lpcb->request) || + lpcb->response.hdr.len > limit) + return -EINVAL; + return clp_req(lpcb, CLP_LPS_BASE) ? -EOPNOTSUPP : 0; +} + +static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb) +{ + switch (lpcb->cmd) { + case 0x0001: /* store logical-processor characteristics */ + return clp_base_slpc(req, (void *) lpcb); + default: + return -EINVAL; + } +} + +static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb) +{ + unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); + + if (lpcb->request.hdr.len != sizeof(lpcb->request) || + lpcb->response.hdr.len > limit) + return -EINVAL; + return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0; +} + +static int clp_pci_list(struct clp_req *req, struct clp_req_rsp_list_pci *lpcb) +{ + unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); + + if (lpcb->request.hdr.len != sizeof(lpcb->request) || + lpcb->response.hdr.len > limit) + return -EINVAL; + if (lpcb->request.reserved2 != 0) + return -EINVAL; + return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0; +} + +static int clp_pci_query(struct clp_req *req, + struct clp_req_rsp_query_pci *lpcb) +{ + unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); + + if (lpcb->request.hdr.len != sizeof(lpcb->request) || + lpcb->response.hdr.len > limit) + return -EINVAL; + if (lpcb->request.reserved2 != 0 || lpcb->request.reserved3 != 0) + return -EINVAL; + return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0; +} + +static int clp_pci_query_grp(struct clp_req *req, + struct clp_req_rsp_query_pci_grp *lpcb) +{ + unsigned long limit = PAGE_SIZE - sizeof(lpcb->request); + + if (lpcb->request.hdr.len != sizeof(lpcb->request) || + lpcb->response.hdr.len > limit) + return -EINVAL; + if (lpcb->request.reserved2 != 0 || lpcb->request.reserved3 != 0 || + lpcb->request.reserved4 != 0) + return -EINVAL; + return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0; +} + +static int clp_pci_command(struct clp_req *req, struct clp_req_hdr *lpcb) +{ + switch (lpcb->cmd) { + case 0x0001: /* store logical-processor characteristics */ + return clp_pci_slpc(req, (void *) lpcb); + case 0x0002: /* list PCI functions */ + return clp_pci_list(req, (void *) lpcb); + case 0x0003: /* query PCI function */ + return clp_pci_query(req, (void *) lpcb); + case 0x0004: /* query PCI function group */ + return clp_pci_query_grp(req, (void *) lpcb); + default: + return -EINVAL; + } +} + +static int clp_normal_command(struct clp_req *req) +{ + struct clp_req_hdr *lpcb; + void __user *uptr; + int rc; + + rc = -EINVAL; + if (req->lps != 0 && req->lps != 2) + goto out; + + rc = -ENOMEM; + lpcb = clp_alloc_block(GFP_KERNEL); + if (!lpcb) + goto out; + + rc = -EFAULT; + uptr = (void __force __user *)(unsigned long) req->data_p; + if (copy_from_user(lpcb, uptr, PAGE_SIZE) != 0) + goto out_free; + + rc = -EINVAL; + if (lpcb->fmt != 0 || lpcb->reserved1 != 0 || lpcb->reserved2 != 0) + goto out_free; + + switch (req->lps) { + case 0: + rc = clp_base_command(req, lpcb); + break; + case 2: + rc = clp_pci_command(req, lpcb); + break; + } + if (rc) + goto out_free; + + rc = -EFAULT; + if (copy_to_user(uptr, lpcb, PAGE_SIZE) != 0) + goto out_free; + + rc = 0; + +out_free: + clp_free_block(lpcb); +out: + return rc; +} + +static int clp_immediate_command(struct clp_req *req) +{ + void __user *uptr; + unsigned long ilp; + int exists; + + if (req->cmd > 1 || clp_get_ilp(&ilp) != 0) + return -EINVAL; + + uptr = (void __force __user *)(unsigned long) req->data_p; + if (req->cmd == 0) { + /* Command code 0: test for a specific processor */ + exists = test_bit_inv(req->lps, &ilp); + return put_user(exists, (int __user *) uptr); + } + /* Command code 1: return bit mask of installed processors */ + return put_user(ilp, (unsigned long __user *) uptr); +} + +static long clp_misc_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct clp_req req; + void __user *argp; + + if (cmd != CLP_SYNC) + return -EINVAL; + + argp = is_compat_task() ? compat_ptr(arg) : (void __user *) arg; + if (copy_from_user(&req, argp, sizeof(req))) + return -EFAULT; + if (req.r != 0) + return -EINVAL; + return req.c ? clp_immediate_command(&req) : clp_normal_command(&req); +} + +static int clp_misc_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static const struct file_operations clp_misc_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .release = clp_misc_release, + .unlocked_ioctl = clp_misc_ioctl, + .compat_ioctl = clp_misc_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice clp_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "clp", + .fops = &clp_misc_fops, +}; + +builtin_misc_device(clp_misc_device); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c new file mode 100644 index 0000000000..ca6bd98eec --- /dev/null +++ b/arch/s390/pci/pci_debug.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012,2015 + * + * Author(s): + * Jan Glauber + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +static struct dentry *debugfs_root; +debug_info_t *pci_debug_msg_id; +EXPORT_SYMBOL_GPL(pci_debug_msg_id); +debug_info_t *pci_debug_err_id; +EXPORT_SYMBOL_GPL(pci_debug_err_id); + +static char *pci_common_names[] = { + "Load operations", + "Store operations", + "Store block operations", + "Refresh operations", +}; + +static char *pci_fmt0_names[] = { + "DMA read bytes", + "DMA write bytes", +}; + +static char *pci_fmt1_names[] = { + "Received bytes", + "Received packets", + "Transmitted bytes", + "Transmitted packets", +}; + +static char *pci_fmt2_names[] = { + "Consumed work units", + "Maximum work units", +}; + +static char *pci_fmt3_names[] = { + "Transmitted bytes", +}; + +static char *pci_sw_names[] = { + "Allocated pages", + "Mapped pages", + "Unmapped pages", +}; + +static void pci_fmb_show(struct seq_file *m, char *name[], int length, + u64 *data) +{ + int i; + + for (i = 0; i < length; i++, data++) + seq_printf(m, "%26s:\t%llu\n", name[i], *data); +} + +static void pci_sw_counter_show(struct seq_file *m) +{ + struct zpci_dev *zdev = m->private; + atomic64_t *counter = &zdev->allocated_pages; + int i; + + for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) + seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], + atomic64_read(counter)); +} + +static int pci_perf_show(struct seq_file *m, void *v) +{ + struct zpci_dev *zdev = m->private; + + if (!zdev) + return 0; + + mutex_lock(&zdev->lock); + if (!zdev->fmb) { + mutex_unlock(&zdev->lock); + seq_puts(m, "FMB statistics disabled\n"); + return 0; + } + + /* header */ + seq_printf(m, "Update interval: %u ms\n", zdev->fmb_update); + seq_printf(m, "Samples: %u\n", zdev->fmb->samples); + seq_printf(m, "Last update TOD: %Lx\n", zdev->fmb->last_update); + + pci_fmb_show(m, pci_common_names, ARRAY_SIZE(pci_common_names), + &zdev->fmb->ld_ops); + + switch (zdev->fmb->format) { + case 0: + if (!(zdev->fmb->fmt_ind & ZPCI_FMB_DMA_COUNTER_VALID)) + break; + pci_fmb_show(m, pci_fmt0_names, ARRAY_SIZE(pci_fmt0_names), + &zdev->fmb->fmt0.dma_rbytes); + break; + case 1: + pci_fmb_show(m, pci_fmt1_names, ARRAY_SIZE(pci_fmt1_names), + &zdev->fmb->fmt1.rx_bytes); + break; + case 2: + pci_fmb_show(m, pci_fmt2_names, ARRAY_SIZE(pci_fmt2_names), + &zdev->fmb->fmt2.consumed_work_units); + break; + case 3: + pci_fmb_show(m, pci_fmt3_names, ARRAY_SIZE(pci_fmt3_names), + &zdev->fmb->fmt3.tx_bytes); + break; + default: + seq_puts(m, "Unknown format\n"); + } + + pci_sw_counter_show(m); + mutex_unlock(&zdev->lock); + return 0; +} + +static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *off) +{ + struct zpci_dev *zdev = ((struct seq_file *) file->private_data)->private; + unsigned long val; + int rc; + + if (!zdev) + return 0; + + rc = kstrtoul_from_user(ubuf, count, 10, &val); + if (rc) + return rc; + + mutex_lock(&zdev->lock); + switch (val) { + case 0: + rc = zpci_fmb_disable_device(zdev); + break; + case 1: + rc = zpci_fmb_enable_device(zdev); + break; + } + mutex_unlock(&zdev->lock); + return rc ? rc : count; +} + +static int pci_perf_seq_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, pci_perf_show, + file_inode(filp)->i_private); +} + +static const struct file_operations debugfs_pci_perf_fops = { + .open = pci_perf_seq_open, + .read = seq_read, + .write = pci_perf_seq_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void zpci_debug_init_device(struct zpci_dev *zdev, const char *name) +{ + zdev->debugfs_dev = debugfs_create_dir(name, debugfs_root); + + debugfs_create_file("statistics", S_IFREG | S_IRUGO | S_IWUSR, + zdev->debugfs_dev, zdev, &debugfs_pci_perf_fops); +} + +void zpci_debug_exit_device(struct zpci_dev *zdev) +{ + debugfs_remove_recursive(zdev->debugfs_dev); +} + +int __init zpci_debug_init(void) +{ + /* event trace buffer */ + pci_debug_msg_id = debug_register("pci_msg", 8, 1, 8 * sizeof(long)); + if (!pci_debug_msg_id) + return -EINVAL; + debug_register_view(pci_debug_msg_id, &debug_sprintf_view); + debug_set_level(pci_debug_msg_id, 3); + + /* error log */ + pci_debug_err_id = debug_register("pci_error", 2, 1, 16); + if (!pci_debug_err_id) + return -EINVAL; + debug_register_view(pci_debug_err_id, &debug_hex_ascii_view); + debug_set_level(pci_debug_err_id, 3); + + debugfs_root = debugfs_create_dir("pci", NULL); + return 0; +} + +void zpci_debug_exit(void) +{ + debug_unregister(pci_debug_msg_id); + debug_unregister(pci_debug_err_id); + debugfs_remove(debugfs_root); +} diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c new file mode 100644 index 0000000000..99209085c7 --- /dev/null +++ b/arch/s390/pci/pci_dma.c @@ -0,0 +1,746 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * + * Author(s): + * Jan Glauber + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *dma_region_table_cache; +static struct kmem_cache *dma_page_table_cache; +static int s390_iommu_strict; +static u64 s390_iommu_aperture; +static u32 s390_iommu_aperture_factor = 1; + +static int zpci_refresh_global(struct zpci_dev *zdev) +{ + return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma, + zdev->iommu_pages * PAGE_SIZE); +} + +unsigned long *dma_alloc_cpu_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_region_table_cache, gfp); + if (!table) + return NULL; + + for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++) + *entry = ZPCI_TABLE_INVALID; + return table; +} + +static void dma_free_cpu_table(void *table) +{ + kmem_cache_free(dma_region_table_cache, table); +} + +static unsigned long *dma_alloc_page_table(gfp_t gfp) +{ + unsigned long *table, *entry; + + table = kmem_cache_alloc(dma_page_table_cache, gfp); + if (!table) + return NULL; + + for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++) + *entry = ZPCI_PTE_INVALID; + return table; +} + +static void dma_free_page_table(void *table) +{ + kmem_cache_free(dma_page_table_cache, table); +} + +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp) +{ + unsigned long old_rte, rte; + unsigned long *sto; + + rte = READ_ONCE(*rtep); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + } else { + sto = dma_alloc_cpu_table(gfp); + if (!sto) + return NULL; + + set_rt_sto(&rte, virt_to_phys(sto)); + validate_rt_entry(&rte); + entry_clr_protected(&rte); + + old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); + if (old_rte != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_cpu_table(sto); + sto = get_rt_sto(old_rte); + } + } + return sto; +} + +static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp) +{ + unsigned long old_ste, ste; + unsigned long *pto; + + ste = READ_ONCE(*step); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + } else { + pto = dma_alloc_page_table(gfp); + if (!pto) + return NULL; + set_st_pto(&ste, virt_to_phys(pto)); + validate_st_entry(&ste); + entry_clr_protected(&ste); + + old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); + if (old_ste != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_page_table(pto); + pto = get_st_pto(old_ste); + } + } + return pto; +} + +unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, + gfp_t gfp) +{ + unsigned long *sto, *pto; + unsigned int rtx, sx, px; + + rtx = calc_rtx(dma_addr); + sto = dma_get_seg_table_origin(&rto[rtx], gfp); + if (!sto) + return NULL; + + sx = calc_sx(dma_addr); + pto = dma_get_page_table_origin(&sto[sx], gfp); + if (!pto) + return NULL; + + px = calc_px(dma_addr); + return &pto[px]; +} + +void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) +{ + unsigned long pte; + + pte = READ_ONCE(*ptep); + if (flags & ZPCI_PTE_INVALID) { + invalidate_pt_entry(&pte); + } else { + set_pt_pfaa(&pte, page_addr); + validate_pt_entry(&pte); + } + + if (flags & ZPCI_TABLE_PROTECTED) + entry_set_protected(&pte); + else + entry_clr_protected(&pte); + + xchg(ptep, pte); +} + +static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, + dma_addr_t dma_addr, size_t size, int flags) +{ + unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + phys_addr_t page_addr = (pa & PAGE_MASK); + unsigned long *entry; + int i, rc = 0; + + if (!nr_pages) + return -EINVAL; + + if (!zdev->dma_table) + return -EINVAL; + + for (i = 0; i < nr_pages; i++) { + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, + GFP_ATOMIC); + if (!entry) { + rc = -ENOMEM; + goto undo_cpu_trans; + } + dma_update_cpu_trans(entry, page_addr, flags); + page_addr += PAGE_SIZE; + dma_addr += PAGE_SIZE; + } + +undo_cpu_trans: + if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { + flags = ZPCI_PTE_INVALID; + while (i-- > 0) { + page_addr -= PAGE_SIZE; + dma_addr -= PAGE_SIZE; + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr, + GFP_ATOMIC); + if (!entry) + break; + dma_update_cpu_trans(entry, page_addr, flags); + } + } + return rc; +} + +static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, + size_t size, int flags) +{ + unsigned long irqflags; + int ret; + + /* + * With zdev->tlb_refresh == 0, rpcit is not required to establish new + * translations when previously invalid translation-table entries are + * validated. With lazy unmap, rpcit is skipped for previously valid + * entries, but a global rpcit is then required before any address can + * be re-used, i.e. after each iommu bitmap wrap-around. + */ + if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { + if (!zdev->tlb_refresh) + return 0; + } else { + if (!s390_iommu_strict) + return 0; + } + + ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, + PAGE_ALIGN(size)); + if (ret == -ENOMEM && !s390_iommu_strict) { + /* enable the hypervisor to free some resources */ + if (zpci_refresh_global(zdev)) + goto out; + + spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); + bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, + zdev->lazy_bitmap, zdev->iommu_pages); + bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); + ret = 0; + } +out: + return ret; +} + +static int dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, + dma_addr_t dma_addr, size_t size, int flags) +{ + int rc; + + rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); + if (rc) + return rc; + + rc = __dma_purge_tlb(zdev, dma_addr, size, flags); + if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) + __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); + + return rc; +} + +void dma_free_seg_table(unsigned long entry) +{ + unsigned long *sto = get_rt_sto(entry); + int sx; + + for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++) + if (reg_entry_isvalid(sto[sx])) + dma_free_page_table(get_st_pto(sto[sx])); + + dma_free_cpu_table(sto); +} + +void dma_cleanup_tables(unsigned long *table) +{ + int rtx; + + if (!table) + return; + + for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++) + if (reg_entry_isvalid(table[rtx])) + dma_free_seg_table(table[rtx]); + + dma_free_cpu_table(table); +} + +static unsigned long __dma_alloc_iommu(struct device *dev, + unsigned long start, int size) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + + return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, + start, size, zdev->start_dma >> PAGE_SHIFT, + dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT), + 0); +} + +static dma_addr_t dma_alloc_address(struct device *dev, int size) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + unsigned long offset, flags; + + spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); + offset = __dma_alloc_iommu(dev, zdev->next_bit, size); + if (offset == -1) { + if (!s390_iommu_strict) { + /* global flush before DMA addresses are reused */ + if (zpci_refresh_global(zdev)) + goto out_error; + + bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, + zdev->lazy_bitmap, zdev->iommu_pages); + bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); + } + /* wrap-around */ + offset = __dma_alloc_iommu(dev, 0, size); + if (offset == -1) + goto out_error; + } + zdev->next_bit = offset + size; + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); + + return zdev->start_dma + offset * PAGE_SIZE; + +out_error: + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); + return DMA_MAPPING_ERROR; +} + +static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + unsigned long flags, offset; + + offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; + + spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); + if (!zdev->iommu_bitmap) + goto out; + + if (s390_iommu_strict) + bitmap_clear(zdev->iommu_bitmap, offset, size); + else + bitmap_set(zdev->lazy_bitmap, offset, size); + +out: + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); +} + +static inline void zpci_err_dma(unsigned long rc, unsigned long addr) +{ + struct { + unsigned long rc; + unsigned long addr; + } __packed data = {rc, addr}; + + zpci_err_hex(&data, sizeof(data)); +} + +static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + unsigned long attrs) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + unsigned long pa = page_to_phys(page) + offset; + int flags = ZPCI_PTE_VALID; + unsigned long nr_pages; + dma_addr_t dma_addr; + int ret; + + /* This rounds up number of pages based on size and offset */ + nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); + dma_addr = dma_alloc_address(dev, nr_pages); + if (dma_addr == DMA_MAPPING_ERROR) { + ret = -ENOSPC; + goto out_err; + } + + /* Use rounded up size */ + size = nr_pages * PAGE_SIZE; + + if (direction == DMA_NONE || direction == DMA_TO_DEVICE) + flags |= ZPCI_TABLE_PROTECTED; + + ret = dma_update_trans(zdev, pa, dma_addr, size, flags); + if (ret) + goto out_free; + + atomic64_add(nr_pages, &zdev->mapped_pages); + return dma_addr + (offset & ~PAGE_MASK); + +out_free: + dma_free_address(dev, dma_addr, nr_pages); +out_err: + zpci_err("map error:\n"); + zpci_err_dma(ret, pa); + return DMA_MAPPING_ERROR; +} + +static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction direction, + unsigned long attrs) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + int npages, ret; + + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + dma_addr = dma_addr & PAGE_MASK; + ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE, + ZPCI_PTE_INVALID); + if (ret) { + zpci_err("unmap error:\n"); + zpci_err_dma(ret, dma_addr); + return; + } + + atomic64_add(npages, &zdev->unmapped_pages); + dma_free_address(dev, dma_addr, npages); +} + +static void *s390_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + unsigned long attrs) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + struct page *page; + phys_addr_t pa; + dma_addr_t map; + + size = PAGE_ALIGN(size); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + pa = page_to_phys(page); + map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); + if (dma_mapping_error(dev, map)) { + __free_pages(page, get_order(size)); + return NULL; + } + + atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages); + if (dma_handle) + *dma_handle = map; + return phys_to_virt(pa); +} + +static void s390_dma_free(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + unsigned long attrs) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + + size = PAGE_ALIGN(size); + atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages); + s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); + free_pages((unsigned long)vaddr, get_order(size)); +} + +/* Map a segment into a contiguous dma address area */ +static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, + size_t size, dma_addr_t *handle, + enum dma_data_direction dir) +{ + unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + dma_addr_t dma_addr_base, dma_addr; + int flags = ZPCI_PTE_VALID; + struct scatterlist *s; + phys_addr_t pa = 0; + int ret; + + dma_addr_base = dma_alloc_address(dev, nr_pages); + if (dma_addr_base == DMA_MAPPING_ERROR) + return -ENOMEM; + + dma_addr = dma_addr_base; + if (dir == DMA_NONE || dir == DMA_TO_DEVICE) + flags |= ZPCI_TABLE_PROTECTED; + + for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { + pa = page_to_phys(sg_page(s)); + ret = __dma_update_trans(zdev, pa, dma_addr, + s->offset + s->length, flags); + if (ret) + goto unmap; + + dma_addr += s->offset + s->length; + } + ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); + if (ret) + goto unmap; + + *handle = dma_addr_base; + atomic64_add(nr_pages, &zdev->mapped_pages); + + return ret; + +unmap: + dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, + ZPCI_PTE_INVALID); + dma_free_address(dev, dma_addr_base, nr_pages); + zpci_err("map error:\n"); + zpci_err_dma(ret, pa); + return ret; +} + +static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nr_elements, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s = sg, *start = sg, *dma = sg; + unsigned int max = dma_get_max_seg_size(dev); + unsigned int size = s->offset + s->length; + unsigned int offset = s->offset; + int count = 0, i, ret; + + for (i = 1; i < nr_elements; i++) { + s = sg_next(s); + + s->dma_length = 0; + + if (s->offset || (size & ~PAGE_MASK) || + size + s->length > max) { + ret = __s390_dma_map_sg(dev, start, size, + &dma->dma_address, dir); + if (ret) + goto unmap; + + dma->dma_address += offset; + dma->dma_length = size - offset; + + size = offset = s->offset; + start = s; + dma = sg_next(dma); + count++; + } + size += s->length; + } + ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir); + if (ret) + goto unmap; + + dma->dma_address += offset; + dma->dma_length = size - offset; + + return count + 1; +unmap: + for_each_sg(sg, s, count, i) + s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), + dir, attrs); + + return ret; +} + +static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nr_elements, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nr_elements, i) { + if (s->dma_length) + s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, + dir, attrs); + s->dma_address = 0; + s->dma_length = 0; + } +} + +static unsigned long *bitmap_vzalloc(size_t bits, gfp_t flags) +{ + size_t n = BITS_TO_LONGS(bits); + size_t bytes; + + if (unlikely(check_mul_overflow(n, sizeof(unsigned long), &bytes))) + return NULL; + + return vzalloc(bytes); +} + +int zpci_dma_init_device(struct zpci_dev *zdev) +{ + u8 status; + int rc; + + /* + * At this point, if the device is part of an IOMMU domain, this would + * be a strong hint towards a bug in the IOMMU API (common) code and/or + * simultaneous access via IOMMU and DMA API. So let's issue a warning. + */ + WARN_ON(zdev->s390_domain); + + spin_lock_init(&zdev->iommu_bitmap_lock); + + zdev->dma_table = dma_alloc_cpu_table(GFP_KERNEL); + if (!zdev->dma_table) { + rc = -ENOMEM; + goto out; + } + + /* + * Restrict the iommu bitmap size to the minimum of the following: + * - s390_iommu_aperture which defaults to high_memory + * - 3-level pagetable address limit minus start_dma offset + * - DMA address range allowed by the hardware (clp query pci fn) + * + * Also set zdev->end_dma to the actual end address of the usable + * range, instead of the theoretical maximum as reported by hardware. + * + * This limits the number of concurrently usable DMA mappings since + * for each DMA mapped memory address we need a DMA address including + * extra DMA addresses for multiple mappings of the same memory address. + */ + zdev->start_dma = PAGE_ALIGN(zdev->start_dma); + zdev->iommu_size = min3(s390_iommu_aperture, + ZPCI_TABLE_SIZE_RT - zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); + zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; + zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; + zdev->iommu_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); + if (!zdev->iommu_bitmap) { + rc = -ENOMEM; + goto free_dma_table; + } + if (!s390_iommu_strict) { + zdev->lazy_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); + if (!zdev->lazy_bitmap) { + rc = -ENOMEM; + goto free_bitmap; + } + + } + if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status)) { + rc = -EIO; + goto free_bitmap; + } + + return 0; +free_bitmap: + vfree(zdev->iommu_bitmap); + zdev->iommu_bitmap = NULL; + vfree(zdev->lazy_bitmap); + zdev->lazy_bitmap = NULL; +free_dma_table: + dma_free_cpu_table(zdev->dma_table); + zdev->dma_table = NULL; +out: + return rc; +} + +int zpci_dma_exit_device(struct zpci_dev *zdev) +{ + int cc = 0; + + /* + * At this point, if the device is part of an IOMMU domain, this would + * be a strong hint towards a bug in the IOMMU API (common) code and/or + * simultaneous access via IOMMU and DMA API. So let's issue a warning. + */ + WARN_ON(zdev->s390_domain); + if (zdev_enabled(zdev)) + cc = zpci_unregister_ioat(zdev, 0); + /* + * cc == 3 indicates the function is gone already. This can happen + * if the function was deconfigured/disabled suddenly and we have not + * received a new handle yet. + */ + if (cc && cc != 3) + return -EIO; + + dma_cleanup_tables(zdev->dma_table); + zdev->dma_table = NULL; + vfree(zdev->iommu_bitmap); + zdev->iommu_bitmap = NULL; + vfree(zdev->lazy_bitmap); + zdev->lazy_bitmap = NULL; + zdev->next_bit = 0; + return 0; +} + +static int __init dma_alloc_cpu_table_caches(void) +{ + dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables", + ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN, + 0, NULL); + if (!dma_region_table_cache) + return -ENOMEM; + + dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables", + ZPCI_PT_SIZE, ZPCI_PT_ALIGN, + 0, NULL); + if (!dma_page_table_cache) { + kmem_cache_destroy(dma_region_table_cache); + return -ENOMEM; + } + return 0; +} + +int __init zpci_dma_init(void) +{ + s390_iommu_aperture = (u64)virt_to_phys(high_memory); + if (!s390_iommu_aperture_factor) + s390_iommu_aperture = ULONG_MAX; + else + s390_iommu_aperture *= s390_iommu_aperture_factor; + + return dma_alloc_cpu_table_caches(); +} + +void zpci_dma_exit(void) +{ + kmem_cache_destroy(dma_page_table_cache); + kmem_cache_destroy(dma_region_table_cache); +} + +const struct dma_map_ops s390_pci_dma_ops = { + .alloc = s390_dma_alloc, + .free = s390_dma_free, + .map_sg = s390_dma_map_sg, + .unmap_sg = s390_dma_unmap_sg, + .map_page = s390_dma_map_pages, + .unmap_page = s390_dma_unmap_pages, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, + /* dma_supported is unconditionally true without a callback */ +}; +EXPORT_SYMBOL_GPL(s390_pci_dma_ops); + +static int __init s390_iommu_setup(char *str) +{ + if (!strcmp(str, "strict")) + s390_iommu_strict = 1; + return 1; +} + +__setup("s390_iommu=", s390_iommu_setup); + +static int __init s390_iommu_aperture_setup(char *str) +{ + if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) + s390_iommu_aperture_factor = 1; + return 1; +} + +__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c new file mode 100644 index 0000000000..b9324ca2eb --- /dev/null +++ b/arch/s390/pci/pci_event.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * + * Author(s): + * Jan Glauber + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include "pci_bus.h" + +/* Content Code Description for PCI Function Error */ +struct zpci_ccdf_err { + u32 reserved1; + u32 fh; /* function handle */ + u32 fid; /* function id */ + u32 ett : 4; /* expected table type */ + u32 mvn : 12; /* MSI vector number */ + u32 dmaas : 8; /* DMA address space */ + u32 : 6; + u32 q : 1; /* event qualifier */ + u32 rw : 1; /* read/write */ + u64 faddr; /* failing address */ + u32 reserved3; + u16 reserved4; + u16 pec; /* PCI event code */ +} __packed; + +/* Content Code Description for PCI Function Availability */ +struct zpci_ccdf_avail { + u32 reserved1; + u32 fh; /* function handle */ + u32 fid; /* function id */ + u32 reserved2; + u32 reserved3; + u32 reserved4; + u32 reserved5; + u16 reserved6; + u16 pec; /* PCI event code */ +} __packed; + +static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) +{ + switch (ers_res) { + case PCI_ERS_RESULT_CAN_RECOVER: + case PCI_ERS_RESULT_RECOVERED: + case PCI_ERS_RESULT_NEED_RESET: + return false; + default: + return true; + } +} + +static bool is_passed_through(struct zpci_dev *zdev) +{ + return zdev->s390_domain; +} + +static bool is_driver_supported(struct pci_driver *driver) +{ + if (!driver || !driver->err_handler) + return false; + if (!driver->err_handler->error_detected) + return false; + if (!driver->err_handler->slot_reset) + return false; + if (!driver->err_handler->resume) + return false; + return true; +} + +static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + if (ers_result_indicates_abort(ers_res)) + pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); + else if (ers_res == PCI_ERS_RESULT_NEED_RESET) + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct zpci_dev *zdev = to_zpci(pdev); + int rc; + + pr_info("%s: Unblocking device access for examination\n", pci_name(pdev)); + rc = zpci_reset_load_store_blocked(zdev); + if (rc) { + pr_err("%s: Unblocking device access failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + if (driver->err_handler->mmio_enabled) { + ers_res = driver->err_handler->mmio_enabled(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after MMIO re-enable\n", + pci_name(pdev)); + return ers_res; + } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + return ers_res; + } + } + + pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); + rc = zpci_clear_error_state(zdev); + if (!rc) { + pdev->error_state = pci_channel_io_normal; + } else { + pr_err("%s: Unblocking DMA failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + pr_info("%s: Initiating reset\n", pci_name(pdev)); + if (zpci_hot_reset_device(to_zpci(pdev))) { + pr_err("%s: The reset request failed\n", pci_name(pdev)); + return ers_res; + } + pdev->error_state = pci_channel_io_normal; + ers_res = driver->err_handler->slot_reset(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); + return ers_res; + } + + return ers_res; +} + +/* zpci_event_attempt_error_recovery - Try to recover the given PCI function + * @pdev: PCI function to recover currently in the error state + * + * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst. + * With the simplification that recovery always happens per function + * and the platform determines which functions are affected for + * multi-function devices. + */ +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct pci_driver *driver; + + /* + * Ensure that the PCI function is not removed concurrently, no driver + * is unbound or probed and that userspace can't access its + * configuration space while we perform recovery. + */ + pci_dev_lock(pdev); + if (pdev->error_state == pci_channel_io_perm_failure) { + ers_res = PCI_ERS_RESULT_DISCONNECT; + goto out_unlock; + } + pdev->error_state = pci_channel_io_frozen; + + if (is_passed_through(to_zpci(pdev))) { + pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", + pci_name(pdev)); + goto out_unlock; + } + + driver = to_pci_driver(pdev->dev.driver); + if (!is_driver_supported(driver)) { + if (!driver) + pr_info("%s: Cannot be recovered because no driver is bound to the device\n", + pci_name(pdev)); + else + pr_info("%s: The %s driver bound to the device does not support error recovery\n", + pci_name(pdev), + driver->name); + goto out_unlock; + } + + ers_res = zpci_event_notify_error_detected(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + + if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { + ers_res = zpci_event_do_error_state_clear(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + } + + if (ers_res == PCI_ERS_RESULT_NEED_RESET) + ers_res = zpci_event_do_reset(pdev, driver); + + if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pr_err("%s: Automatic recovery failed; operator intervention is required\n", + pci_name(pdev)); + goto out_unlock; + } + + pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); + if (driver->err_handler->resume) + driver->err_handler->resume(pdev); +out_unlock: + pci_dev_unlock(pdev); + + return ers_res; +} + +/* zpci_event_io_failure - Report PCI channel failure state to driver + * @pdev: PCI function for which to report + * @es: PCI channel failure state to report + */ +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) +{ + struct pci_driver *driver; + + pci_dev_lock(pdev); + pdev->error_state = es; + /** + * While vfio-pci's error_detected callback notifies user-space QEMU + * reacts to this by freezing the guest. In an s390 environment PCI + * errors are rarely fatal so this is overkill. Instead in the future + * we will inject the error event and let the guest recover the device + * itself. + */ + if (is_passed_through(to_zpci(pdev))) + goto out; + driver = to_pci_driver(pdev->dev.driver); + if (driver && driver->err_handler && driver->err_handler->error_detected) + driver->err_handler->error_detected(pdev, pdev->error_state); +out: + pci_dev_unlock(pdev); +} + +static void __zpci_event_error(struct zpci_ccdf_err *ccdf) +{ + struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); + struct pci_dev *pdev = NULL; + pci_ers_result_t ers_res; + + zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); + zpci_err("error CCDF:\n"); + zpci_err_hex(ccdf, sizeof(*ccdf)); + + if (zdev) { + zpci_update_fh(zdev, ccdf->fh); + if (zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + } + + pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", + pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); + + if (!pdev) + goto no_pdev; + + switch (ccdf->pec) { + case 0x003a: /* Service Action or Error Recovery Successful */ + ers_res = zpci_event_attempt_error_recovery(pdev); + if (ers_res != PCI_ERS_RESULT_RECOVERED) + zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + break; + default: + /* + * Mark as frozen not permanently failed because the device + * could be subsequently recovered by the platform. + */ + zpci_event_io_failure(pdev, pci_channel_io_frozen); + break; + } + pci_dev_put(pdev); +no_pdev: + zpci_zdev_put(zdev); +} + +void zpci_event_error(void *data) +{ + if (zpci_is_enabled()) + __zpci_event_error(data); +} + +static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) +{ + zpci_update_fh(zdev, fh); + /* Give the driver a hint that the function is + * already unusable. + */ + zpci_bus_remove_device(zdev, true); + /* Even though the device is already gone we still + * need to free zPCI resources as part of the disable. + */ + if (zdev->dma_table) + zpci_dma_exit_device(zdev); + if (zdev_enabled(zdev)) + zpci_disable_device(zdev); + zdev->state = ZPCI_FN_STATE_STANDBY; +} + +static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) +{ + struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); + bool existing_zdev = !!zdev; + enum zpci_state state; + + zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); + switch (ccdf->pec) { + case 0x0301: /* Reserved|Standby -> Configured */ + if (!zdev) { + zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); + if (IS_ERR(zdev)) + break; + } else { + /* the configuration request may be stale */ + if (zdev->state != ZPCI_FN_STATE_STANDBY) + break; + zdev->state = ZPCI_FN_STATE_CONFIGURED; + } + zpci_scan_configured_device(zdev, ccdf->fh); + break; + case 0x0302: /* Reserved -> Standby */ + if (!zdev) + zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); + else + zpci_update_fh(zdev, ccdf->fh); + break; + case 0x0303: /* Deconfiguration requested */ + if (zdev) { + /* The event may have been queued before we confirgured + * the device. + */ + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + break; + zpci_update_fh(zdev, ccdf->fh); + zpci_deconfigure_device(zdev); + } + break; + case 0x0304: /* Configured -> Standby|Reserved */ + if (zdev) { + /* The event may have been queued before we confirgured + * the device.: + */ + if (zdev->state == ZPCI_FN_STATE_CONFIGURED) + zpci_event_hard_deconfigured(zdev, ccdf->fh); + /* The 0x0304 event may immediately reserve the device */ + if (!clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_device_reserved(zdev); + } + } + break; + case 0x0306: /* 0x308 or 0x302 for multiple devices */ + zpci_remove_reserved_devices(); + clp_scan_pci_devices(); + break; + case 0x0308: /* Standby -> Reserved */ + if (!zdev) + break; + zpci_device_reserved(zdev); + break; + default: + break; + } + if (existing_zdev) + zpci_zdev_put(zdev); +} + +void zpci_event_availability(void *data) +{ + if (zpci_is_enabled()) + __zpci_event_availability(data); +} diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c new file mode 100644 index 0000000000..56480be482 --- /dev/null +++ b/arch/s390/pci/pci_insn.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 specific pci instructions + * + * Copyright IBM Corp. 2013 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ + +struct zpci_err_insn_data { + u8 insn; + u8 cc; + u8 status; + union { + struct { + u64 req; + u64 offset; + }; + struct { + u64 addr; + u64 len; + }; + }; +} __packed; + +static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status, + u64 req, u64 offset) +{ + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .req = req, .offset = offset}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); +} + +static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status, + u64 addr, u64 len) +{ + struct zpci_err_insn_data data = { + .insn = insn, .cc = cc, .status = status, + .addr = addr, .len = len}; + + zpci_err_hex_level(lvl, &data, sizeof(data)); +} + +/* Modify PCI Function Controls */ +static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) +{ + u8 cc; + + asm volatile ( + " .insn rxy,0xe300000000d0,%[req],%[fib]\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [req] "+d" (req), [fib] "+Q" (*fib) + : : "cc"); + *status = req >> 24 & 0xff; + return cc; +} + +u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) +{ + bool retried = false; + u8 cc; + + do { + cc = __mpcifc(req, fib, status); + if (cc == 2) { + msleep(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'M', cc, *status, req, 0); + retried = true; + } + } + } while (cc == 2); + + if (cc) + zpci_err_insn_req(0, 'M', cc, *status, req, 0); + else if (retried) + zpci_err_insn_req(1, 'M', cc, *status, req, 0); + + return cc; +} +EXPORT_SYMBOL_GPL(zpci_mod_fc); + +/* Refresh PCI Translations */ +static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) +{ + union register_pair addr_range = {.even = addr, .odd = range}; + u8 cc; + + asm volatile ( + " .insn rre,0xb9d30000,%[fn],%[addr_range]\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc), [fn] "+d" (fn) + : [addr_range] "d" (addr_range.pair) + : "cc"); + *status = fn >> 24 & 0xff; + return cc; +} + +int zpci_refresh_trans(u64 fn, u64 addr, u64 range) +{ + bool retried = false; + u8 cc, status; + + do { + cc = __rpcit(fn, addr, range, &status); + if (cc == 2) { + udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_addr(1, 'R', cc, status, addr, range); + retried = true; + } + } + } while (cc == 2); + + if (cc) + zpci_err_insn_addr(0, 'R', cc, status, addr, range); + else if (retried) + zpci_err_insn_addr(1, 'R', cc, status, addr, range); + + if (cc == 1 && (status == 4 || status == 16)) + return -ENOMEM; + + return (cc) ? -EIO : 0; +} + +/* Set Interruption Controls */ +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +{ + if (!test_facility(72)) + return -EIO; + + asm volatile( + ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]\n" + : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [iib] "Q" (*iib)); + + return 0; +} +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); + +/* PCI Load */ +static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) +{ + union register_pair req_off = {.even = req, .odd = offset}; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + " .insn rre,0xb9d20000,%[data],%[req_off]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [data] "=d" (__data), + [req_off] "+&d" (req_off.pair) :: "cc"); + *status = req_off.even >> 24 & 0xff; + *data = __data; + return cc; +} + +static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) +{ + u64 __data; + int cc; + + cc = ____pcilg(&__data, req, offset, status); + if (!cc) + *data = __data; + + return cc; +} + +int __zpci_load(u64 *data, u64 req, u64 offset) +{ + bool retried = false; + u8 status; + int cc; + + do { + cc = __pcilg(data, req, offset, &status); + if (cc == 2) { + udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 'l', cc, status, req, offset); + retried = true; + } + } + } while (cc == 2); + + if (cc) + zpci_err_insn_req(0, 'l', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'l', cc, status, req, offset); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(__zpci_load); + +static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); + + return __zpci_load(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [data] "=d" (__data), + [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc"); + *status = ioaddr_len.odd >> 24 & 0xff; + *data = __data; + return cc; +} + +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_load_fh(data, addr, len); + + cc = __pcilg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(zpci_load); + +/* PCI Store */ +static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) +{ + union register_pair req_off = {.even = req, .odd = offset}; + int cc = -ENXIO; + + asm volatile ( + " .insn rre,0xb9d00000,%[data],%[req_off]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair) + : [data] "d" (data) + : "cc"); + *status = req_off.even >> 24 & 0xff; + return cc; +} + +int __zpci_store(u64 data, u64 req, u64 offset) +{ + bool retried = false; + u8 status; + int cc; + + do { + cc = __pcistg(data, req, offset, &status); + if (cc == 2) { + udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(1, 's', cc, status, req, offset); + retried = true; + } + } + } while (cc == 2); + + if (cc) + zpci_err_insn_req(0, 's', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 's', cc, status, req, offset); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(__zpci_store); + +static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); + + return __zpci_store(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) +{ + union register_pair ioaddr_len = {.even = ioaddr, .odd = len}; + int cc = -ENXIO; + + asm volatile ( + " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + : [data] "d" (data) + : "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; + return cc; +} + +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_store_fh(addr, data, len); + + cc = __pcistg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(zpci_store); + +/* PCI Store Block */ +static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " .insn rsy,0xeb00000000d0,%[req],%[offset],%[data]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [req] "+d" (req) + : [offset] "d" (offset), [data] "Q" (*data) + : "cc"); + *status = req >> 24 & 0xff; + return cc; +} + +int __zpci_store_block(const u64 *data, u64 req, u64 offset) +{ + bool retried = false; + u8 status; + int cc; + + do { + cc = __pcistb(data, req, offset, &status); + if (cc == 2) { + udelay(ZPCI_INSN_BUSY_DELAY); + if (!retried) { + zpci_err_insn_req(0, 'b', cc, status, req, offset); + retried = true; + } + } + } while (cc == 2); + + if (cc) + zpci_err_insn_req(0, 'b', cc, status, req, offset); + else if (retried) + zpci_err_insn_req(1, 'b', cc, status, req, offset); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(__zpci_store_block); + +static inline int zpci_write_block_fh(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 offset = ZPCI_OFFSET(dst); + + return __zpci_store_block(src, req, offset); +} + +static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "d" (ioaddr), [data] "Q" (*data) + : "cc"); + *status = len >> 24 & 0xff; + return cc; +} + +int zpci_write_block(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_write_block_fh(dst, src, len); + + cc = __pcistb_mio(src, (__force u64) dst, len, &status); + if (cc) + zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len); + + return (cc > 0) ? -EIO : cc; +} +EXPORT_SYMBOL_GPL(zpci_write_block); + +static inline void __pciwb_mio(void) +{ + asm volatile (".insn rre,0xb9d50000,0,0\n"); +} + +void zpci_barrier(void) +{ + if (static_branch_likely(&have_mio)) + __pciwb_mio(); +} +EXPORT_SYMBOL_GPL(zpci_barrier); diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c new file mode 100644 index 0000000000..ead062bf2b --- /dev/null +++ b/arch/s390/pci/pci_iov.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Niklas Schnelle + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include "pci_iov.h" + +static struct resource iov_res = { + .name = "PCI IOV res", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; + +void zpci_iov_map_resources(struct pci_dev *pdev) +{ + resource_size_t len; + int i; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + int bar = i + PCI_IOV_RESOURCES; + + len = pci_resource_len(pdev, bar); + if (!len) + continue; + pdev->resource[bar].parent = &iov_res; + } +} + +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) +{ + pci_lock_rescan_remove(); + /* Linux' vfid's start at 0 vfn at 1 */ + pci_iov_remove_virtfn(pdev->physfn, vfn - 1); + pci_unlock_rescan_remove(); +} + +static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid) +{ + int rc; + + rc = pci_iov_sysfs_link(pdev, virtfn, vfid); + if (rc) + return rc; + + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + virtfn->physfn = pci_dev_get(pdev); + + return 0; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + int i, cand_devfn; + struct zpci_dev *zdev; + struct pci_dev *pdev; + int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ + int rc = 0; + + if (!zbus->multifunction) + return 0; + + /* If the parent PF for the given VF is also configured in the + * instance, it must be on the same zbus. + * We can then identify the parent PF by checking what + * devfn the VF would have if it belonged to that PF using the PF's + * stride and offset. Only if this candidate devfn matches the + * actual devfn will we link both functions. + */ + for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (zdev && zdev->is_physfn) { + pdev = pci_get_slot(zbus->bus, zdev->devfn); + if (!pdev) + continue; + cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); + if (cand_devfn == virtfn->devfn) { + rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); + /* balance pci_get_slot() */ + pci_dev_put(pdev); + break; + } + /* balance pci_get_slot() */ + pci_dev_put(pdev); + } + } + return rc; +} diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h new file mode 100644 index 0000000000..b2c828003b --- /dev/null +++ b/arch/s390/pci/pci_iov.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Niklas Schnelle + * + */ + +#ifndef __S390_PCI_IOV_H +#define __S390_PCI_IOV_H + +#ifdef CONFIG_PCI_IOV +void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn); + +void zpci_iov_map_resources(struct pci_dev *pdev); + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); + +#else /* CONFIG_PCI_IOV */ +static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} + +static inline void zpci_iov_map_resources(struct pci_dev *pdev) {} + +static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + return 0; +} +#endif /* CONFIG_PCI_IOV */ +#endif /* __S390_PCI_IOV_h */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c new file mode 100644 index 0000000000..ff8f24854c --- /dev/null +++ b/arch/s390/pci/pci_irq.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static enum {FLOATING, DIRECTED} irq_delivery; + +/* + * summary bit vector + * FLOATING - summary bit per function + * DIRECTED - summary bit per cpu (only used in fallback path) + */ +static struct airq_iv *zpci_sbv; + +/* + * interrupt bit vectors + * FLOATING - interrupt bit vector per function + * DIRECTED - interrupt bit vector per cpu + */ +static struct airq_iv **zpci_ibv; + +/* Modify PCI: Register floating adapter interruptions */ +static int zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt0.isc = PCI_ISC; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ + fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruptions */ +static int zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +/* Modify PCI: Register CPU directed interruptions */ +static int zpci_set_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT_D); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt = 1; + fib.fmt1.noi = zdev->msi_nr_irqs; + fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister CPU directed interruptions */ +static int zpci_clear_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT_D); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.fmt = 1; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +/* Register adapter interruptions */ +static int zpci_set_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); + + if (!rc) + zdev->irqs_registered = 1; + + return rc; +} + +/* Clear adapter interruptions */ +static int zpci_clear_irq(struct zpci_dev *zdev) +{ + int rc; + + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); + + if (!rc) + zdev->irqs_registered = 0; + + return rc; +} + +static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + struct msi_desc *entry = irq_data_get_msi_desc(data); + struct msi_msg msg = entry->msg; + int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); + + msg.address_lo &= 0xff0000ff; + msg.address_lo |= (cpu_addr << 8); + pci_write_msi_msg(data->irq, &msg); + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip zpci_irq_chip = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, +}; + +static void zpci_handle_cpu_local_irq(bool rescan) +{ + struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; + unsigned long bit; + int irqs_on = 0; + + for (bit = 0;;) { + /* Scan the directed IRQ bit vector */ + bit = airq_iv_scan(dibv, bit, airq_iv_end(dibv)); + if (bit == -1UL) { + if (!rescan || irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) + break; + bit = 0; + continue; + } + inc_irq_stat(IRQIO_MSI); + generic_handle_irq(airq_iv_get_data(dibv, bit)); + } +} + +struct cpu_irq_data { + call_single_data_t csd; + atomic_t scheduled; +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_irq_data, irq_data); + +static void zpci_handle_remote_irq(void *data) +{ + atomic_t *scheduled = data; + + do { + zpci_handle_cpu_local_irq(false); + } while (atomic_dec_return(scheduled)); +} + +static void zpci_handle_fallback_irq(void) +{ + struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; + unsigned long cpu; + int irqs_on = 0; + + for (cpu = 0;;) { + cpu = airq_iv_scan(zpci_sbv, cpu, airq_iv_end(zpci_sbv)); + if (cpu == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) + break; + cpu = 0; + continue; + } + cpu_data = &per_cpu(irq_data, cpu); + if (atomic_inc_return(&cpu_data->scheduled) > 1) + continue; + + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); + smp_call_function_single_async(cpu, &cpu_data->csd); + } +} + +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + bool floating = !tpi_info->directed_irq; + + if (floating) { + inc_irq_stat(IRQIO_PCF); + zpci_handle_fallback_irq(); + } else { + inc_irq_stat(IRQIO_PCD); + zpci_handle_cpu_local_irq(true); + } +} + +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + union zpci_sic_iib iib = {{0}}; + unsigned long si, ai; + struct airq_iv *aibv; + int irqs_on = 0; + + inc_irq_stat(IRQIO_PCF); + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(zpci_sbv, si, airq_iv_end(zpci_sbv)); + if (si == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, re-enable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) + break; + si = 0; + continue; + } + + /* Scan the adapter interrupt vector for this device. */ + aibv = zpci_ibv[si]; + for (ai = 0;;) { + ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); + if (ai == -1UL) + break; + inc_irq_stat(IRQIO_MSI); + airq_iv_lock(aibv, ai); + generic_handle_irq(airq_iv_get_data(aibv, ai)); + airq_iv_unlock(aibv, ai); + } + } +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct zpci_dev *zdev = to_zpci(pdev); + unsigned int hwirq, msi_vecs, cpu; + unsigned long bit; + struct msi_desc *msi; + struct msi_msg msg; + int cpu_addr; + int rc, irq; + + zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + + if (irq_delivery == DIRECTED) { + /* Allocate cpu vector bits */ + bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (bit == -1UL) + return -EIO; + } else { + /* Allocate adapter summary indicator bit */ + bit = airq_iv_alloc_bit(zpci_sbv); + if (bit == -1UL) + return -EIO; + zdev->aisb = bit; + + /* Create adapter interrupt vector */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); + if (!zdev->aibv) + return -ENOMEM; + + /* Wire up shortcut pointer */ + zpci_ibv[bit] = zdev->aibv; + /* Each function has its own interrupt vector */ + bit = 0; + } + + /* Request MSI interrupts */ + hwirq = bit; + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { + rc = -EIO; + if (hwirq - bit >= msi_vecs) + break; + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, + (irq_delivery == DIRECTED) ? + msi->affinity : NULL); + if (irq < 0) + return -ENOMEM; + rc = irq_set_msi_desc(irq, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq, &zpci_irq_chip, + handle_percpu_irq); + msg.data = hwirq - bit; + if (irq_delivery == DIRECTED) { + if (msi->affinity) + cpu = cpumask_first(&msi->affinity->mask); + else + cpu = 0; + cpu_addr = smp_cpu_get_cpu_address(cpu); + + msg.address_lo = zdev->msi_addr & 0xff0000ff; + msg.address_lo |= (cpu_addr << 8); + + for_each_possible_cpu(cpu) { + airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + } + } else { + msg.address_lo = zdev->msi_addr & 0xffffffff; + airq_iv_set_data(zdev->aibv, hwirq, irq); + } + msg.address_hi = zdev->msi_addr >> 32; + pci_write_msi_msg(irq, &msg); + hwirq++; + } + + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + + rc = zpci_set_irq(zdev); + if (rc) + return rc; + + return (msi_vecs == nvec) ? 0 : msi_vecs; +} + +void arch_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct msi_desc *msi; + int rc; + + /* Disable interrupts */ + rc = zpci_clear_irq(zdev); + if (rc) + return; + + /* Release MSI interrupts */ + msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { + irq_set_msi_desc(msi->irq, NULL); + irq_free_desc(msi->irq); + msi->msg.address_lo = 0; + msi->msg.address_hi = 0; + msi->msg.data = 0; + msi->irq = 0; + } + + if (zdev->aisb != -1UL) { + zpci_ibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); + zdev->aisb = -1UL; + } + if (zdev->aibv) { + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + } + + if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); +} + +bool arch_restore_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + if (!zdev->irqs_registered) + zpci_set_irq(zdev); + return true; +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void __init cpu_enable_directed_irq(void *unused) +{ + union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; + + iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); +} + +static int __init zpci_directed_irq_init(void) +{ + union zpci_sic_iib iib = {{0}}; + unsigned int cpu; + + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); + if (!zpci_sbv) + return -ENOMEM; + + iib.diib.isc = PCI_ISC; + iib.diib.nr_cpus = num_possible_cpus(); + iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + + zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), + GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + /* + * Per CPU IRQ vectors look the same but bit-allocation + * is only done on the first vector. + */ + zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_DATA | + AIRQ_IV_CACHELINE | + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); + if (!zpci_ibv[cpu]) + return -ENOMEM; + } + on_each_cpu(cpu_enable_directed_irq, NULL, 1); + + zpci_irq_chip.irq_set_affinity = zpci_set_irq_affinity; + + return 0; +} + +static int __init zpci_floating_irq_init(void) +{ + zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!zpci_sbv) + goto out_free; + + return 0; + +out_free: + kfree(zpci_ibv); + return -ENOMEM; +} + +int __init zpci_irq_init(void) +{ + union zpci_sic_iib iib = {{0}}; + int rc; + + irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; + if (s390_pci_force_floating) + irq_delivery = FLOATING; + + if (irq_delivery == DIRECTED) + zpci_airq.handler = zpci_directed_irq_handler; + + rc = register_adapter_interrupt(&zpci_airq); + if (rc) + goto out; + /* Set summary to 1 to be called every time for the ISC. */ + *zpci_airq.lsi_ptr = 1; + + switch (irq_delivery) { + case FLOATING: + rc = zpci_floating_irq_init(); + break; + case DIRECTED: + rc = zpci_directed_irq_init(); + break; + } + + if (rc) + goto out_airq; + + /* + * Enable floating IRQs (with suppression after one IRQ). When using + * directed IRQs this enables the fallback path. + */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); + + return 0; +out_airq: + unregister_adapter_interrupt(&zpci_airq); +out: + return rc; +} + +void __init zpci_irq_exit(void) +{ + unsigned int cpu; + + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_release(zpci_ibv[cpu]); + } + } + kfree(zpci_ibv); + if (zpci_sbv) + airq_iv_release(zpci_sbv); + unregister_adapter_interrupt(&zpci_airq); +} diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c new file mode 100644 index 0000000000..ff34baf50a --- /dev/null +++ b/arch/s390/pci/pci_kvm_hook.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2022. All rights reserved. + * Author(s): Pierre Morel + */ +#include + +struct zpci_kvm_hook zpci_kvm_hook; +EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c new file mode 100644 index 0000000000..a90499c087 --- /dev/null +++ b/arch/s390/pci/pci_mmio.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Access to PCI I/O memory from user space programs. + * + * Copyright IBM Corp. 2014 + * Author(s): Alexey Ishchuk + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset) +{ + struct { + u64 offset; + u8 cc; + u8 status; + } data = {offset, cc, status}; + + zpci_err_hex(&data, sizeof(data)); +} + +static inline int __pcistb_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " sacf 256\n" + "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + "2: sacf 768\n" + EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) + : "cc", "memory"); + *status = len >> 24 & 0xff; + return cc; +} + +static inline int __pcistg_mio_inuser( + void __iomem *ioaddr, const void __user *src, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + int cc = -ENXIO; + u64 val = 0; + u64 cnt = ulen; + u8 tmp; + + /* + * copy 0 < @len <= 8 bytes from @src into the right most bytes of + * a register, then store it to PCI at @ioaddr while in secondary + * address space. pcistg then uses the user mappings. + */ + asm volatile ( + " sacf 256\n" + "0: llgc %[tmp],0(%[src])\n" + "4: sllg %[val],%[val],8\n" + " aghi %[src],1\n" + " ogr %[val],%[tmp]\n" + " brctg %[cnt],0b\n" + "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n" + "2: ipm %[cc]\n" + " srl %[cc],28\n" + "3: sacf 768\n" + EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + : + [src] "+a" (src), [cnt] "+d" (cnt), + [val] "+d" (val), [tmp] "=d" (tmp), + [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair) + :: "cc", "memory"); + *status = ioaddr_len.odd >> 24 & 0xff; + + /* did we read everything from user memory? */ + if (!cc && cnt != 0) + cc = -EFAULT; + + return cc; +} + +static inline int __memcpy_toio_inuser(void __iomem *dst, + const void __user *src, size_t n) +{ + int size, rc = 0; + u8 status = 0; + + if (!src) + return -EINVAL; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) dst, + (u64 __force) src, n, + ZPCI_MAX_WRITE_SIZE); + if (size > 8) /* main path */ + rc = __pcistb_mio_inuser(dst, src, size, &status); + else + rc = __pcistg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; +} + +SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, + const void __user *, user_buffer, size_t, length) +{ + u8 local_buf[64]; + void __iomem *io_addr; + void *buf; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; + long ret; + + if (!zpci_is_enabled()) + return -ENODEV; + + if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) + return -EINVAL; + + /* + * We only support write access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. + */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_toio_inuser((void __iomem *) mmio_addr, + user_buffer, + length); + return ret; + } + + if (length > 64) { + buf = kmalloc(length, GFP_KERNEL); + if (!buf) + return -ENOMEM; + } else + buf = local_buf; + + ret = -EFAULT; + if (copy_from_user(buf, user_buffer, length)) + goto out_free; + + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + if (ret) + goto out_unlock_mmap; + + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); + + if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) + goto out_unlock_pt; + + ret = zpci_memcpy_toio(io_addr, buf, length); +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); +out_free: + if (buf != local_buf) + kfree(buf); + return ret; +} + +static inline int __pcilg_mio_inuser( + void __user *dst, const void __iomem *ioaddr, + u64 ulen, u8 *status) +{ + union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; + u64 cnt = ulen; + int shift = ulen * 8; + int cc = -ENXIO; + u64 val, tmp; + + /* + * read 0 < @len <= 8 bytes from the PCI memory mapped at @ioaddr (in + * user space) into a register using pcilg then store these bytes at + * user address @dst + */ + asm volatile ( + " sacf 256\n" + "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" + "1: ipm %[cc]\n" + " srl %[cc],28\n" + " ltr %[cc],%[cc]\n" + " jne 4f\n" + "2: ahi %[shift],-8\n" + " srlg %[tmp],%[val],0(%[shift])\n" + "3: stc %[tmp],0(%[dst])\n" + "5: aghi %[dst],1\n" + " brctg %[cnt],2b\n" + "4: sacf 768\n" + EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b) + : + [ioaddr_len] "+&d" (ioaddr_len.pair), + [cc] "+d" (cc), [val] "=d" (val), + [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp), + [shift] "+d" (shift) + :: "cc", "memory"); + + /* did we write everything to the user space buffer? */ + if (!cc && cnt != 0) + cc = -EFAULT; + + *status = ioaddr_len.odd >> 24 & 0xff; + return cc; +} + +static inline int __memcpy_fromio_inuser(void __user *dst, + const void __iomem *src, + unsigned long n) +{ + int size, rc = 0; + u8 status; + + while (n > 0) { + size = zpci_get_max_io_size((u64 __force) src, + (u64 __force) dst, n, + ZPCI_MAX_READ_SIZE); + rc = __pcilg_mio_inuser(dst, src, size, &status); + if (rc) + break; + src += size; + dst += size; + n -= size; + } + if (rc) + zpci_err_mmio(rc, status, (__force u64) dst); + return rc; +} + +SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, + void __user *, user_buffer, size_t, length) +{ + u8 local_buf[64]; + void __iomem *io_addr; + void *buf; + struct vm_area_struct *vma; + pte_t *ptep; + spinlock_t *ptl; + long ret; + + if (!zpci_is_enabled()) + return -ENODEV; + + if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length) + return -EINVAL; + + /* + * We only support read access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a pfn lookup which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. + */ + if (static_branch_likely(&have_mio)) { + ret = __memcpy_fromio_inuser( + user_buffer, (const void __iomem *)mmio_addr, + length); + return ret; + } + + if (length > 64) { + buf = kmalloc(length, GFP_KERNEL); + if (!buf) + return -ENOMEM; + } else { + buf = local_buf; + } + + mmap_read_lock(current->mm); + ret = -EINVAL; + vma = vma_lookup(current->mm, mmio_addr); + if (!vma) + goto out_unlock_mmap; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out_unlock_mmap; + ret = -EACCES; + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock_mmap; + + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + if (ret) + goto out_unlock_mmap; + + io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + (mmio_addr & ~PAGE_MASK)); + + if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { + ret = -EFAULT; + goto out_unlock_pt; + } + ret = zpci_memcpy_fromio(buf, io_addr, length); + +out_unlock_pt: + pte_unmap_unlock(ptep, ptl); +out_unlock_mmap: + mmap_read_unlock(current->mm); + + if (!ret && copy_to_user(user_buffer, buf, length)) + ret = -EFAULT; + + if (buf != local_buf) + kfree(buf); + return ret; +} diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c new file mode 100644 index 0000000000..cae280e5c0 --- /dev/null +++ b/arch/s390/pci/pci_sysfs.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2012 + * + * Author(s): + * Jan Glauber + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include + +#include "../../../drivers/pci/pci.h" + +#include + +#define zpci_attr(name, fmt, member) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); \ + \ + return sprintf(buf, fmt, zdev->member); \ +} \ +static DEVICE_ATTR_RO(name) + +zpci_attr(function_id, "0x%08x\n", fid); +zpci_attr(function_handle, "0x%08x\n", fh); +zpci_attr(pchid, "0x%04x\n", pchid); +zpci_attr(pfgid, "0x%02x\n", pfgid); +zpci_attr(vfn, "0x%04x\n", vfn); +zpci_attr(pft, "0x%02x\n", pft); +zpci_attr(port, "%d\n", port); +zpci_attr(uid, "0x%x\n", uid); +zpci_attr(segment0, "0x%02x\n", pfip[0]); +zpci_attr(segment1, "0x%02x\n", pfip[1]); +zpci_attr(segment2, "0x%02x\n", pfip[2]); +zpci_attr(segment3, "0x%02x\n", pfip[3]); + +static ssize_t mio_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + + return sprintf(buf, zpci_use_mio(zdev) ? "1\n" : "0\n"); +} +static DEVICE_ATTR_RO(mio_enabled); + +static ssize_t recover_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kernfs_node *kn; + struct pci_dev *pdev = to_pci_dev(dev); + struct zpci_dev *zdev = to_zpci(pdev); + int ret = 0; + + /* Can't use device_remove_self() here as that would lead us to lock + * the pci_rescan_remove_lock while holding the device' kernfs lock. + * This would create a possible deadlock with disable_slot() which is + * not directly protected by the device' kernfs lock but takes it + * during the device removal which happens under + * pci_rescan_remove_lock. + * + * This is analogous to sdev_store_delete() in + * drivers/scsi/scsi_sysfs.c + */ + kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); + WARN_ON_ONCE(!kn); + /* device_remove_file() serializes concurrent calls ignoring all but + * the first + */ + device_remove_file(dev, attr); + + /* A concurrent call to recover_store() may slip between + * sysfs_break_active_protection() and the sysfs file removal. + * Once it unblocks from pci_lock_rescan_remove() the original pdev + * will already be removed. + */ + pci_lock_rescan_remove(); + if (pci_dev_is_added(pdev)) { + pci_stop_and_remove_bus_device(pdev); + if (zdev->dma_table) { + ret = zpci_dma_exit_device(zdev); + if (ret) + goto out; + } + + if (zdev_enabled(zdev)) { + ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state the FH may indicate an enabled device but + * disable says the device is already disabled don't + * treat it as an error here. + */ + if (ret == -EINVAL) + ret = 0; + if (ret) + goto out; + } + + ret = zpci_enable_device(zdev); + if (ret) + goto out; + ret = zpci_dma_init_device(zdev); + if (ret) { + zpci_disable_device(zdev); + goto out; + } + pci_rescan_bus(zdev->zbus->bus); + } +out: + pci_unlock_rescan_remove(); + if (kn) + sysfs_unbreak_active_protection(kn); + return ret ? ret : count; +} +static DEVICE_ATTR_WO(recover); + +static ssize_t util_string_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + struct zpci_dev *zdev = to_zpci(pdev); + + return memory_read_from_buffer(buf, count, &off, zdev->util_str, + sizeof(zdev->util_str)); +} +static BIN_ATTR_RO(util_string, CLP_UTIL_STR_LEN); + +static ssize_t report_error_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct zpci_report_error_header *report = (void *) buf; + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + struct zpci_dev *zdev = to_zpci(pdev); + int ret; + + if (off || (count < sizeof(*report))) + return -EINVAL; + + ret = sclp_pci_report(report, zdev->fh, zdev->fid); + + return ret ? ret : count; +} +static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); + +static ssize_t uid_is_unique_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0); +} +static DEVICE_ATTR_RO(uid_is_unique); + +#ifndef CONFIG_DMI +/* analogous to smbios index */ +static ssize_t index_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + u32 index = ~0; + + if (zpci_unique_uid) + index = zdev->uid; + + return sysfs_emit(buf, "%u\n", index); +} +static DEVICE_ATTR_RO(index); + +static umode_t zpci_index_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + return zpci_unique_uid ? attr->mode : 0; +} + +static struct attribute *zpci_ident_attrs[] = { + &dev_attr_index.attr, + NULL, +}; + +static struct attribute_group zpci_ident_attr_group = { + .attrs = zpci_ident_attrs, + .is_visible = zpci_index_is_visible, +}; +#endif + +static struct bin_attribute *zpci_bin_attrs[] = { + &bin_attr_util_string, + &bin_attr_report_error, + NULL, +}; + +static struct attribute *zpci_dev_attrs[] = { + &dev_attr_function_id.attr, + &dev_attr_function_handle.attr, + &dev_attr_pchid.attr, + &dev_attr_pfgid.attr, + &dev_attr_pft.attr, + &dev_attr_port.attr, + &dev_attr_vfn.attr, + &dev_attr_uid.attr, + &dev_attr_recover.attr, + &dev_attr_mio_enabled.attr, + &dev_attr_uid_is_unique.attr, + NULL, +}; + +static struct attribute_group zpci_attr_group = { + .attrs = zpci_dev_attrs, + .bin_attrs = zpci_bin_attrs, +}; + +static struct attribute *pfip_attrs[] = { + &dev_attr_segment0.attr, + &dev_attr_segment1.attr, + &dev_attr_segment2.attr, + &dev_attr_segment3.attr, + NULL, +}; +static struct attribute_group pfip_attr_group = { + .name = "pfip", + .attrs = pfip_attrs, +}; + +const struct attribute_group *zpci_attr_groups[] = { + &zpci_attr_group, + &pfip_attr_group, +#ifndef CONFIG_DMI + &zpci_ident_attr_group, +#endif + NULL, +}; diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore new file mode 100644 index 0000000000..97ca527794 --- /dev/null +++ b/arch/s390/purgatory/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +purgatory +purgatory.chk +purgatory.lds +purgatory.ro diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile new file mode 100644 index 0000000000..4e930f5668 --- /dev/null +++ b/arch/s390/purgatory/Makefile @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0 + +OBJECT_FILES_NON_STANDARD := y + +purgatory-y := head.o purgatory.o string.o sha256.o mem.o + +targets += $(purgatory-y) purgatory.lds purgatory purgatory.chk purgatory.ro +PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + +$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE + $(call if_changed_rule,cc_o_c) + +CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY + +$(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE + $(call if_changed_rule,as_o_S) + +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n + +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare +KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding +KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += $(CLANG_FLAGS) +KBUILD_CFLAGS += $(call cc-option,-fno-PIE) +KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) + +# Since we link purgatory with -r unresolved symbols are not checked, so we +# also link a purgatory.chk binary without -r to check for unresolved symbols. +PURGATORY_LDFLAGS := -nostdlib -z nodefaultlib +LDFLAGS_purgatory := -r $(PURGATORY_LDFLAGS) -T +LDFLAGS_purgatory.chk := -e purgatory_start $(PURGATORY_LDFLAGS) +$(obj)/purgatory: $(obj)/purgatory.lds $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + +$(obj)/purgatory.chk: $(obj)/purgatory FORCE + $(call if_changed,ld) + +OBJCOPYFLAGS_purgatory.ro := -O elf64-s390 +OBJCOPYFLAGS_purgatory.ro += --remove-section='*debug*' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.comment' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*' +$(obj)/purgatory.ro: $(obj)/purgatory $(obj)/purgatory.chk FORCE + $(call if_changed,objcopy) + +$(obj)/kexec-purgatory.o: $(obj)/purgatory.ro + +obj-y += kexec-purgatory.o diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S new file mode 100644 index 0000000000..0f93f2e72e --- /dev/null +++ b/arch/s390/purgatory/head.S @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Purgatory setup code + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#include +#include +#include +#include +#include + +/* The purgatory is the code running between two kernels. It's main purpose + * is to verify that the next kernel was not corrupted after load and to + * start it. + * + * If the next kernel is a crash kernel there are some peculiarities to + * consider: + * + * First the purgatory is called twice. Once only to verify the + * sha digest. So if the crash kernel got corrupted the old kernel can try + * to trigger a stand-alone dumper. And once to actually load the crash kernel. + * + * Second the purgatory also has to swap the crash memory region with its + * destination at address 0. As the purgatory is part of crash memory this + * requires some finesse. The tactic here is that the purgatory first copies + * itself to the end of the destination and then swaps the rest of the + * memory running from there. + */ + +#define bufsz purgatory_end-stack + +.macro MEMCPY dst,src,len + lgr %r0,\dst + lgr %r1,\len + lgr %r2,\src + lgr %r3,\len + +20: mvcle %r0,%r2,0 + jo 20b +.endm + +.macro MEMSWAP dst,src,buf,len +10: larl %r0,purgatory_end + larl %r1,stack + slgr %r0,%r1 + cgr \len,%r0 + jh 11f + lgr %r4,\len + j 12f +11: lgr %r4,%r0 + +12: MEMCPY \buf,\dst,%r4 + MEMCPY \dst,\src,%r4 + MEMCPY \src,\buf,%r4 + + agr \dst,%r4 + agr \src,%r4 + sgr \len,%r4 + + cghi \len,0 + jh 10b +.endm + +.macro START_NEXT_KERNEL base subcode + lg %r4,kernel_entry-\base(%r13) + lg %r5,load_psw_mask-\base(%r13) + ogr %r4,%r5 + stg %r4,0(%r0) + + xgr %r0,%r0 + lghi %r1,\subcode + diag %r0,%r1,0x308 +.endm + + .text + .balign PAGE_SIZE +SYM_CODE_START(purgatory_start) + /* The purgatory might be called after a diag308 so better set + * architecture and addressing mode. + */ + lhi %r1,1 + sigp %r1,%r0,SIGP_SET_ARCHITECTURE + sam64 + + larl %r5,gprregs + stmg %r6,%r15,0(%r5) + + basr %r13,0 +.base_crash: + + /* Setup stack */ + larl %r15,purgatory_end-STACK_FRAME_OVERHEAD + + /* If the next kernel is KEXEC_TYPE_CRASH the purgatory is called + * directly with a flag passed in %r2 whether the purgatory shall do + * checksum verification only (%r2 = 0 -> verification only). + * + * Check now and preserve over C function call by storing in + * %r10 with + * 1 -> checksum verification only + * 0 -> load new kernel + */ + lghi %r10,0 + lg %r11,kernel_type-.base_crash(%r13) + cghi %r11,1 /* KEXEC_TYPE_CRASH */ + jne .do_checksum_verification + cghi %r2,0 /* checksum verification only */ + jne .do_checksum_verification + lghi %r10,1 + +.do_checksum_verification: + brasl %r14,verify_sha256_digest + + cghi %r10,1 /* checksum verification only */ + je .return_old_kernel + cghi %r2,0 /* checksum match */ + jne .disabled_wait + + /* If the next kernel is a crash kernel the purgatory has to swap + * the mem regions first. + */ + cghi %r11,1 /* KEXEC_TYPE_CRASH */ + je .start_crash_kernel + + /* start normal kernel */ + START_NEXT_KERNEL .base_crash 0 + +.return_old_kernel: + lmg %r6,%r15,gprregs-.base_crash(%r13) + br %r14 + +.disabled_wait: + lpswe disabled_wait_psw-.base_crash(%r13) + +.start_crash_kernel: + /* Location of purgatory_start in crash memory */ + larl %r0,.base_crash + larl %r1,purgatory_start + slgr %r0,%r1 + lgr %r8,%r13 + sgr %r8,%r0 + + /* Destination for this code i.e. end of memory to be swapped. */ + larl %r0,purgatory_end + larl %r1,purgatory_start + slgr %r0,%r1 + lg %r9,crash_size-.base_crash(%r13) + sgr %r9,%r0 + + /* Destination in crash memory, i.e. same as r9 but in crash memory. */ + lg %r10,crash_start-.base_crash(%r13) + agr %r10,%r9 + + /* Buffer location (in crash memory) and size. As the purgatory is + * behind the point of no return it can re-use the stack as buffer. + */ + larl %r11,purgatory_end + larl %r12,stack + slgr %r11,%r12 + + MEMCPY %r12,%r9,%r11 /* dst -> (crash) buf */ + MEMCPY %r9,%r8,%r11 /* self -> dst */ + + /* Jump to new location. */ + lgr %r7,%r9 + larl %r0,.jump_to_dst + larl %r1,purgatory_start + slgr %r0,%r1 + agr %r7,%r0 + br %r7 + +.jump_to_dst: + basr %r13,0 +.base_dst: + + /* clear buffer */ + MEMCPY %r12,%r10,%r11 /* (crash) buf -> (crash) dst */ + + /* Load new buffer location after jump */ + larl %r7,stack + lgr %r0,%r7 + larl %r1,purgatory_start + slgr %r0,%r1 + agr %r10,%r0 + MEMCPY %r10,%r7,%r11 /* (new) buf -> (crash) buf */ + + /* Now the code is set up to run from its designated location. Start + * swapping the rest of crash memory now. + * + * The registers will be used as follow: + * + * %r0-%r4 reserved for macros defined above + * %r5-%r6 tmp registers + * %r7 pointer to current struct sha region + * %r8 index to iterate over all sha regions + * %r9 pointer in crash memory + * %r10 pointer in old kernel + * %r11 total size (still) to be moved + * %r12 pointer to buffer + */ + lgr %r12,%r7 + lgr %r11,%r9 + lghi %r10,0 + lg %r9,crash_start-.base_dst(%r13) + lghi %r8,16 /* KEXEC_SEGMENTS_MAX */ + larl %r7,purgatory_sha_regions + + j .loop_first + + /* Loop over all purgatory_sha_regions. */ +.loop_next: + aghi %r8,-1 + cghi %r8,0 + je .loop_out + + aghi %r7,__KEXEC_SHA_REGION_SIZE + +.loop_first: + lg %r5,__KEXEC_SHA_REGION_START(%r7) + cghi %r5,0 + je .loop_next + + /* Copy [end last sha region, start current sha region) */ + /* Note: kexec_sha_region->start points in crash memory */ + sgr %r5,%r9 + MEMCPY %r9,%r10,%r5 + + agr %r9,%r5 + agr %r10,%r5 + sgr %r11,%r5 + + /* Swap sha region */ + lg %r6,__KEXEC_SHA_REGION_LEN(%r7) + MEMSWAP %r9,%r10,%r12,%r6 + sg %r11,__KEXEC_SHA_REGION_LEN(%r7) + j .loop_next + +.loop_out: + /* Copy rest of crash memory */ + MEMCPY %r9,%r10,%r11 + + /* start crash kernel */ + START_NEXT_KERNEL .base_dst 1 +SYM_CODE_END(purgatory_start) + +SYM_DATA_LOCAL(load_psw_mask, .long 0x00080000,0x80000000) + .balign 8 +SYM_DATA_LOCAL(disabled_wait_psw, .quad 0x0002000180000000,.do_checksum_verification) +SYM_DATA_LOCAL(gprregs, .fill 10,8,0) +SYM_DATA(purgatory_sha256_digest, .skip 32) +SYM_DATA(purgatory_sha_regions, .skip 16*__KEXEC_SHA_REGION_SIZE) +SYM_DATA(kernel_entry, .skip 8) +SYM_DATA(kernel_type, .skip 8) +SYM_DATA(crash_start, .skip 8) +SYM_DATA(crash_size, .skip 8) + .balign PAGE_SIZE +SYM_DATA_START_LOCAL(stack) + /* The buffer to move this code must be as big as the code. */ + .skip stack-purgatory_start + .balign PAGE_SIZE +SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, purgatory_end) diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S new file mode 100644 index 0000000000..25f512b1de --- /dev/null +++ b/arch/s390/purgatory/kexec-purgatory.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + + .section .rodata, "a" + + .balign 8 +SYM_DATA_START(kexec_purgatory) + .incbin "arch/s390/purgatory/purgatory.ro" +SYM_DATA_END_LABEL(kexec_purgatory, SYM_L_LOCAL, kexec_purgatory_end) + + .balign 8 +SYM_DATA(kexec_purgatory_size, .quad kexec_purgatory_end-kexec_purgatory) diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c new file mode 100644 index 0000000000..030efda05d --- /dev/null +++ b/arch/s390/purgatory/purgatory.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Purgatory code running between two kernels. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Philipp Rudo + */ + +#include +#include +#include +#include + +int verify_sha256_digest(void) +{ + struct kexec_sha_region *ptr, *end; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state sctx; + + sha256_init(&sctx); + end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); + + for (ptr = purgatory_sha_regions; ptr < end; ptr++) + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + + sha256_final(&sctx, digest); + + if (memcmp(digest, purgatory_sha256_digest, sizeof(digest))) + return 1; + + return 0; +} diff --git a/arch/s390/purgatory/purgatory.lds.S b/arch/s390/purgatory/purgatory.lds.S new file mode 100644 index 0000000000..482eb4fbce --- /dev/null +++ b/arch/s390/purgatory/purgatory.lds.S @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) + +ENTRY(purgatory_start) + +SECTIONS +{ + . = 0; + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + + . = ALIGN(256); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); /* For convenience during zeroing */ + _ebss = .; + } + _end = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.eh_frame) + *(*__ksymtab*) + *(___kcrctab*) + } +} diff --git a/arch/s390/purgatory/string.c b/arch/s390/purgatory/string.c new file mode 100644 index 0000000000..c98c22a72d --- /dev/null +++ b/arch/s390/purgatory/string.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define __HAVE_ARCH_MEMCMP /* arch function */ +#include "../lib/string.c" diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore new file mode 100644 index 0000000000..ea62f37b79 --- /dev/null +++ b/arch/s390/tools/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +gen_facilities +gen_opcode_table diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile new file mode 100644 index 0000000000..f9dd47ff9a --- /dev/null +++ b/arch/s390/tools/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for s390 specific build tools +# + +kapi := arch/$(ARCH)/include/generated/asm +kapi-hdrs-y := $(kapi)/facility-defs.h $(kapi)/dis-defs.h + +PHONY += kapi + +kapi: $(kapi-hdrs-y) + +hostprogs += gen_facilities +hostprogs += gen_opcode_table + +HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE) + +filechk_facility-defs.h = $(obj)/gen_facilities + +filechk_dis-defs.h = \ + $(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt + +$(kapi)/facility-defs.h: $(obj)/gen_facilities FORCE + $(call filechk,facility-defs.h) + +$(kapi)/dis-defs.h: $(obj)/gen_opcode_table FORCE + $(call filechk,dis-defs.h) diff --git a/arch/s390/tools/gcc-thunk-extern.sh b/arch/s390/tools/gcc-thunk-extern.sh new file mode 100755 index 0000000000..20bcbf6dd7 --- /dev/null +++ b/arch/s390/tools/gcc-thunk-extern.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Borrowed from gcc: gcc/testsuite/gcc.target/s390/nobp-section-type-conflict.c +# Checks that we don't get error: section type conflict with ‘put_page’. + +cat << "END" | $@ -x c - -fno-PIE -march=z10 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -O2 -c -o /dev/null +int a; +int b (void); +void c (int); + +static void +put_page (void) +{ + if (b ()) + c (a); +} + +__attribute__ ((__section__ (".init.text"), __cold__)) void +d (void) +{ + put_page (); + put_page (); +} +END diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c new file mode 100644 index 0000000000..cb0aff5c01 --- /dev/null +++ b/arch/s390/tools/gen_facilities.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple program to generate defines out of facility lists that use the bit + * numbering scheme from the Princples of Operations: most significant bit + * has bit number 0. + * + * Copyright IBM Corp. 2015, 2018 + * + */ + +#include +#include +#include +#include + +struct facility_def { + char *name; + int *bits; +}; + +static struct facility_def facility_defs[] = { + { + /* + * FACILITIES_ALS contains the list of facilities that are + * required to run a kernel that is compiled e.g. with + * -march=. + */ + .name = "FACILITIES_ALS", + .bits = (int[]){ + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ + 18, /* long displacement facility */ + 21, /* extended-immediate facility */ + 25, /* store clock fast */ + 27, /* mvcos */ + 32, /* compare and swap and store */ + 33, /* compare and swap and store 2 */ + 34, /* general instructions extension */ + 35, /* execute extensions */ +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + 45, /* fast-BCR, etc. */ +#endif +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + 49, /* misc-instruction-extensions */ + 52, /* interlocked facility 2 */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES + 53, /* load-and-zero-rightmost-byte, etc. */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES + 58, /* miscellaneous-instruction-extension 2 */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z15_FEATURES + 61, /* miscellaneous-instruction-extension 3 */ +#endif + -1 /* END */ + } + }, + { + /* + * FACILITIES_KVM contains the list of facilities that are part + * of the default facility mask and list that are passed to the + * initial CPU model. If no CPU model is used, this, together + * with the non-hypervisor managed bits, is the maximum list of + * guest facilities supported by KVM. + */ + .name = "FACILITIES_KVM", + .bits = (int[]){ + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ + 2, /* z/Arch mode active */ + 3, /* DAT-enhancement */ + 4, /* idte segment table */ + 5, /* idte region table */ + 6, /* ASN-and-LX reuse */ + 7, /* stfle */ + 8, /* enhanced-DAT 1 */ + 9, /* sense-running-status */ + 10, /* conditional sske */ + 13, /* ipte-range */ + 14, /* nonquiescing key-setting */ + 73, /* transactional execution */ + 75, /* access-exception-fetch/store indication */ + 76, /* msa extension 3 */ + 77, /* msa extension 4 */ + 78, /* enhanced-DAT 2 */ + 130, /* instruction-execution-protection */ + 131, /* enhanced-SOP 2 and side-effect */ + 139, /* multiple epoch facility */ + 146, /* msa extension 8 */ + 150, /* enhanced sort */ + 151, /* deflate conversion */ + 155, /* msa extension 9 */ + -1 /* END */ + } + }, + { + /* + * FACILITIES_KVM_CPUMODEL contains the list of facilities + * that can be enabled by CPU model code if the host supports + * it. These facilities are not passed to the guest without + * CPU model support. + */ + + .name = "FACILITIES_KVM_CPUMODEL", + .bits = (int[]){ + 12, /* AP Query Configuration Information */ + 15, /* AP Facilities Test */ + 156, /* etoken facility */ + 165, /* nnpa facility */ + 193, /* bear enhancement facility */ + 194, /* rdp enhancement facility */ + 196, /* processor activity instrumentation facility */ + 197, /* processor activity instrumentation extension 1 */ + -1 /* END */ + } + }, +}; + +static void print_facility_list(struct facility_def *def) +{ + unsigned int high, bit, dword, i; + unsigned long long *array; + + array = calloc(1, 8); + if (!array) + exit(EXIT_FAILURE); + high = 0; + for (i = 0; def->bits[i] != -1; i++) { + bit = 63 - (def->bits[i] & 63); + dword = def->bits[i] / 64; + if (dword > high) { + array = realloc(array, (dword + 1) * 8); + if (!array) + exit(EXIT_FAILURE); + memset(array + high + 1, 0, (dword - high) * 8); + high = dword; + } + array[dword] |= 1ULL << bit; + } + printf("#define %s ", def->name); + for (i = 0; i <= high; i++) + printf("_AC(0x%016llx,UL)%c", array[i], i < high ? ',' : '\n'); + free(array); +} + +static void print_facility_lists(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(facility_defs) / sizeof(facility_defs[0]); i++) + print_facility_list(&facility_defs[i]); +} + +int main(int argc, char **argv) +{ + printf("#ifndef __ASM_S390_FACILITY_DEFS__\n"); + printf("#define __ASM_S390_FACILITY_DEFS__\n"); + printf("/*\n"); + printf(" * DO NOT MODIFY.\n"); + printf(" *\n"); + printf(" * This file was generated by %s\n", __FILE__); + printf(" */\n\n"); + printf("#include \n\n"); + print_facility_lists(); + printf("\n#endif\n"); + return 0; +} diff --git a/arch/s390/tools/gen_opcode_table.c b/arch/s390/tools/gen_opcode_table.c new file mode 100644 index 0000000000..a1bc02b29c --- /dev/null +++ b/arch/s390/tools/gen_opcode_table.c @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generate opcode table initializers for the in-kernel disassembler. + * + * Copyright IBM Corp. 2017 + * + */ + +#include +#include +#include +#include + +#define STRING_SIZE_MAX 20 + +struct insn_type { + unsigned char byte; + unsigned char mask; + char **format; +}; + +struct insn { + struct insn_type *type; + char opcode[STRING_SIZE_MAX]; + char name[STRING_SIZE_MAX]; + char upper[STRING_SIZE_MAX]; + char format[STRING_SIZE_MAX]; + unsigned int name_len; +}; + +struct insn_group { + struct insn_type *type; + int offset; + int count; + char opcode[2]; +}; + +struct insn_format { + char *format; + int type; +}; + +struct gen_opcode { + struct insn *insn; + int nr; + struct insn_group *group; + int nr_groups; +}; + +/* + * Table of instruction format types. Each opcode is defined with at + * least one byte (two nibbles), three nibbles, or two bytes (four + * nibbles). + * The byte member of each instruction format type entry defines + * within which byte of an instruction the third (and fourth) nibble + * of an opcode can be found. The mask member is the and-mask that + * needs to be applied on this byte in order to get the third (and + * fourth) nibble of the opcode. + * The format array defines all instruction formats (as defined in the + * Principles of Operation) which have the same position of the opcode + * nibbles. + * A special case are instruction formats with 1-byte opcodes. In this + * case the byte member always is zero, so that the mask is applied on + * the (only) byte that contains the opcode. + */ +static struct insn_type insn_type_table[] = { + { + .byte = 0, + .mask = 0xff, + .format = (char *[]) { + "MII", + "RR", + "RS", + "RSI", + "RX", + "SI", + "SMI", + "SS", + NULL, + }, + }, + { + .byte = 1, + .mask = 0x0f, + .format = (char *[]) { + "RI", + "RIL", + "SSF", + NULL, + }, + }, + { + .byte = 1, + .mask = 0xff, + .format = (char *[]) { + "E", + "IE", + "RRE", + "RRF", + "RRR", + "S", + "SIL", + "SSE", + NULL, + }, + }, + { + .byte = 5, + .mask = 0xff, + .format = (char *[]) { + "RIE", + "RIS", + "RRS", + "RSE", + "RSL", + "RSY", + "RXE", + "RXF", + "RXY", + "SIY", + "VRI", + "VRR", + "VRS", + "VRV", + "VRX", + "VSI", + NULL, + }, + }, +}; + +static struct insn_type *insn_format_to_type(char *format) +{ + char tmp[STRING_SIZE_MAX]; + char *base_format, **ptr; + int i; + + strcpy(tmp, format); + base_format = tmp; + base_format = strsep(&base_format, "_"); + for (i = 0; i < sizeof(insn_type_table) / sizeof(insn_type_table[0]); i++) { + ptr = insn_type_table[i].format; + while (*ptr) { + if (!strcmp(base_format, *ptr)) + return &insn_type_table[i]; + ptr++; + } + } + exit(EXIT_FAILURE); +} + +static void read_instructions(struct gen_opcode *desc) +{ + struct insn insn; + int rc, i; + + while (1) { + rc = scanf("%s %s %s", insn.opcode, insn.name, insn.format); + if (rc == EOF) + break; + if (rc != 3) + exit(EXIT_FAILURE); + insn.type = insn_format_to_type(insn.format); + insn.name_len = strlen(insn.name); + for (i = 0; i <= insn.name_len; i++) + insn.upper[i] = toupper((unsigned char)insn.name[i]); + desc->nr++; + desc->insn = realloc(desc->insn, desc->nr * sizeof(*desc->insn)); + if (!desc->insn) + exit(EXIT_FAILURE); + desc->insn[desc->nr - 1] = insn; + } +} + +static int cmpformat(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->format, ((struct insn *)b)->format); +} + +static void print_formats(struct gen_opcode *desc) +{ + char *format; + int i, count; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmpformat); + format = ""; + count = 0; + printf("enum {\n"); + for (i = 0; i < desc->nr; i++) { + if (!strcmp(format, desc->insn[i].format)) + continue; + count++; + format = desc->insn[i].format; + printf("\tINSTR_%s,\n", format); + } + printf("}; /* %d */\n\n", count); +} + +static int cmp_long_insn(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->name, ((struct insn *)b)->name); +} + +static void print_long_insn(struct gen_opcode *desc) +{ + struct insn *insn; + int i, count; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmp_long_insn); + count = 0; + printf("enum {\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->name_len < 6) + continue; + printf("\tLONG_INSN_%s,\n", insn->upper); + count++; + } + printf("}; /* %d */\n\n", count); + + printf("#define LONG_INSN_INITIALIZER { \\\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->name_len < 6) + continue; + printf("\t[LONG_INSN_%s] = \"%s\", \\\n", insn->upper, insn->name); + } + printf("}\n\n"); +} + +static void print_opcode(struct insn *insn, int nr) +{ + char *opcode; + + opcode = insn->opcode; + if (insn->type->byte != 0) + opcode += 2; + printf("\t[%4d] = { .opfrag = 0x%s, .format = INSTR_%s, ", nr, opcode, insn->format); + if (insn->name_len < 6) + printf(".name = \"%s\" ", insn->name); + else + printf(".offset = LONG_INSN_%s ", insn->upper); + printf("}, \\\n"); +} + +static void add_to_group(struct gen_opcode *desc, struct insn *insn, int offset) +{ + struct insn_group *group; + + group = desc->group ? &desc->group[desc->nr_groups - 1] : NULL; + if (group && (!strncmp(group->opcode, insn->opcode, 2) || group->type->byte == 0)) { + group->count++; + return; + } + desc->nr_groups++; + desc->group = realloc(desc->group, desc->nr_groups * sizeof(*desc->group)); + if (!desc->group) + exit(EXIT_FAILURE); + group = &desc->group[desc->nr_groups - 1]; + memcpy(group->opcode, insn->opcode, 2); + group->type = insn->type; + group->offset = offset; + group->count = 1; +} + +static int cmpopcode(const void *a, const void *b) +{ + return strcmp(((struct insn *)a)->opcode, ((struct insn *)b)->opcode); +} + +static void print_opcode_table(struct gen_opcode *desc) +{ + char opcode[2] = ""; + struct insn *insn; + int i, offset; + + qsort(desc->insn, desc->nr, sizeof(*desc->insn), cmpopcode); + printf("#define OPCODE_TABLE_INITIALIZER { \\\n"); + offset = 0; + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->type->byte == 0) + continue; + add_to_group(desc, insn, offset); + if (strncmp(opcode, insn->opcode, 2)) { + memcpy(opcode, insn->opcode, 2); + printf("\t/* %.2s */ \\\n", opcode); + } + print_opcode(insn, offset); + offset++; + } + printf("\t/* 1-byte opcode instructions */ \\\n"); + for (i = 0; i < desc->nr; i++) { + insn = &desc->insn[i]; + if (insn->type->byte != 0) + continue; + add_to_group(desc, insn, offset); + print_opcode(insn, offset); + offset++; + } + printf("}\n\n"); +} + +static void print_opcode_table_offsets(struct gen_opcode *desc) +{ + struct insn_group *group; + int i; + + printf("#define OPCODE_OFFSET_INITIALIZER { \\\n"); + for (i = 0; i < desc->nr_groups; i++) { + group = &desc->group[i]; + printf("\t{ .opcode = 0x%.2s, .mask = 0x%02x, .byte = %d, .offset = %d, .count = %d }, \\\n", + group->opcode, group->type->mask, group->type->byte, group->offset, group->count); + } + printf("}\n\n"); +} + +int main(int argc, char **argv) +{ + struct gen_opcode _desc = { 0 }; + struct gen_opcode *desc = &_desc; + + read_instructions(desc); + printf("#ifndef __S390_GENERATED_DIS_DEFS_H__\n"); + printf("#define __S390_GENERATED_DIS_DEFS_H__\n"); + printf("/*\n"); + printf(" * DO NOT MODIFY.\n"); + printf(" *\n"); + printf(" * This file was generated by %s\n", __FILE__); + printf(" */\n\n"); + print_formats(desc); + print_long_insn(desc); + print_opcode_table(desc); + print_opcode_table_offsets(desc); + printf("#endif\n"); + exit(EXIT_SUCCESS); +} diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt new file mode 100644 index 0000000000..5f008e7948 --- /dev/null +++ b/arch/s390/tools/opcodes.txt @@ -0,0 +1,1250 @@ +0000 illegal E +0002 brkpt E +0101 pr E +0102 upt E +0104 ptff E +0107 sckpf E +010a pfpo E +010b tam E +010c sam24 E +010d sam31 E +010e sam64 E +01ff trap2 E +04 spm RR_R0 +05 balr RR_RR +06 bctr RR_RR +07 bcr RR_UR +0a svc RR_U0 +0b bsm RR_RR +0c bassm RR_RR +0d basr RR_RR +0e mvcl RR_RR +0f clcl RR_RR +10 lpr RR_RR +11 lnr RR_RR +12 ltr RR_RR +13 lcr RR_RR +14 nr RR_RR +15 clr RR_RR +16 or RR_RR +17 xr RR_RR +18 lr RR_RR +19 cr RR_RR +1a ar RR_RR +1b sr RR_RR +1c mr RR_RR +1d dr RR_RR +1e alr RR_RR +1f slr RR_RR +20 lpdr RR_FF +21 lndr RR_FF +22 ltdr RR_FF +23 lcdr RR_FF +24 hdr RR_FF +25 ldxr RR_FF +26 mxr RR_FF +27 mxdr RR_FF +28 ldr RR_FF +29 cdr RR_FF +2a adr RR_FF +2b sdr RR_FF +2c mdr RR_FF +2d ddr RR_FF +2e awr RR_FF +2f swr RR_FF +30 lper RR_FF +31 lner RR_FF +32 lter RR_FF +33 lcer RR_FF +34 her RR_FF +35 ledr RR_FF +36 axr RR_FF +37 sxr RR_FF +38 ler RR_FF +39 cer RR_FF +3a aer RR_FF +3b ser RR_FF +3c mder RR_FF +3d der RR_FF +3e aur RR_FF +3f sur RR_FF +40 sth RX_RRRD +41 la RX_RRRD +42 stc RX_RRRD +43 ic RX_RRRD +44 ex RX_RRRD +45 bal RX_RRRD +46 bct RX_RRRD +47 bc RX_URRD +48 lh RX_RRRD +49 ch RX_RRRD +4a ah RX_RRRD +4b sh RX_RRRD +4c mh RX_RRRD +4d bas RX_RRRD +4e cvd RX_RRRD +4f cvb RX_RRRD +50 st RX_RRRD +51 lae RX_RRRD +54 n RX_RRRD +55 cl RX_RRRD +56 o RX_RRRD +57 x RX_RRRD +58 l RX_RRRD +59 c RX_RRRD +5a a RX_RRRD +5b s RX_RRRD +5c m RX_RRRD +5d d RX_RRRD +5e al RX_RRRD +5f sl RX_RRRD +60 std RX_FRRD +67 mxd RX_FRRD +68 ld RX_FRRD +69 cd RX_FRRD +6a ad RX_FRRD +6b sd RX_FRRD +6c md RX_FRRD +6d dd RX_FRRD +6e aw RX_FRRD +6f sw RX_FRRD +70 ste RX_FRRD +71 ms RX_RRRD +78 le RX_FRRD +79 ce RX_FRRD +7a ae RX_FRRD +7b se RX_FRRD +7c mde RX_FRRD +7d de RX_FRRD +7e au RX_FRRD +7f su RX_FRRD +80 ssm SI_RD +82 lpsw SI_RD +83 diag RS_RRRD +84 brxh RSI_RRP +85 brxle RSI_RRP +86 bxh RS_RRRD +87 bxle RS_RRRD +88 srl RS_R0RD +89 sll RS_R0RD +8a sra RS_R0RD +8b sla RS_R0RD +8c srdl RS_R0RD +8d sldl RS_R0RD +8e srda RS_R0RD +8f slda RS_R0RD +90 stm RS_RRRD +91 tm SI_URD +92 mvi SI_URD +93 ts SI_RD +94 ni SI_URD +95 cli SI_URD +96 oi SI_URD +97 xi SI_URD +98 lm RS_RRRD +99 trace RS_RRRD +9a lam RS_AARD +9b stam RS_AARD +a50 iihh RI_RU +a51 iihl RI_RU +a52 iilh RI_RU +a53 iill RI_RU +a54 nihh RI_RU +a55 nihl RI_RU +a56 nilh RI_RU +a57 nill RI_RU +a58 oihh RI_RU +a59 oihl RI_RU +a5a oilh RI_RU +a5b oill RI_RU +a5c llihh RI_RU +a5d llihl RI_RU +a5e llilh RI_RU +a5f llill RI_RU +a70 tmlh RI_RU +a71 tmll RI_RU +a72 tmhh RI_RU +a73 tmhl RI_RU +a74 brc RI_UP +a75 bras RI_RP +a76 brct RI_RP +a77 brctg RI_RP +a78 lhi RI_RI +a79 lghi RI_RI +a7a ahi RI_RI +a7b aghi RI_RI +a7c mhi RI_RI +a7d mghi RI_RI +a7e chi RI_RI +a7f cghi RI_RI +a8 mvcle RS_RRRD +a9 clcle RS_RRRD +aa0 rinext RI_RI +aa1 rion RI_RI +aa2 tric RI_RI +aa3 rioff RI_RI +aa4 riemit RI_RI +ac stnsm SI_URD +ad stosm SI_URD +ae sigp RS_RRRD +af mc SI_URD +b1 lra RX_RRRD +b200 lbear S_RD +b201 stbear S_RD +b202 stidp S_RD +b204 sck S_RD +b205 stck S_RD +b206 sckc S_RD +b207 stckc S_RD +b208 spt S_RD +b209 stpt S_RD +b20a spka S_RD +b20b ipk S_00 +b20d ptlb S_00 +b210 spx S_RD +b211 stpx S_RD +b212 stap S_RD +b214 sie S_RD +b218 pc S_RD +b219 sac S_RD +b21a cfc S_RD +b220 servc RRE_RR +b221 ipte RRF_RURR +b222 ipm RRE_R0 +b223 ivsk RRE_RR +b224 iac RRE_R0 +b225 ssar RRE_R0 +b226 epar RRE_R0 +b227 esar RRE_R0 +b228 pt RRE_RR +b229 iske RRE_RR +b22a rrbe RRE_RR +b22b sske RRF_U0RR +b22c tb RRE_RR +b22d dxr RRE_FF +b22e pgin RRE_RR +b22f pgout RRE_RR +b230 csch S_00 +b231 hsch S_00 +b232 msch S_RD +b233 ssch S_RD +b234 stsch S_RD +b235 tsch S_RD +b236 tpi S_RD +b237 sal S_00 +b238 rsch S_00 +b239 stcrw S_RD +b23a stcps S_RD +b23b rchp S_00 +b23c schm S_00 +b240 bakr RRE_RR +b241 cksm RRE_RR +b244 sqdr RRE_FF +b245 sqer RRE_FF +b246 stura RRE_RR +b247 msta RRE_R0 +b248 palb RRE_00 +b249 ereg RRE_RR +b24a esta RRE_RR +b24b lura RRE_RR +b24c tar RRE_AR +b24d cpya RRE_AA +b24e sar RRE_AR +b24f ear RRE_RA +b250 csp RRE_RR +b252 msr RRE_RR +b254 mvpg RRE_RR +b255 mvst RRE_RR +b256 sthyi RRE_RR +b257 cuse RRE_RR +b258 bsg RRE_RR +b25a bsa RRE_RR +b25d clst RRE_RR +b25e srst RRE_RR +b25f chsc RRE_R0 +b263 cmpsc RRE_RR +b274 siga S_RD +b276 xsch S_00 +b277 rp S_RD +b278 stcke S_RD +b279 sacf S_RD +b27c stckf S_RD +b27d stsi S_RD +b280 lpp S_RD +b284 lcctl S_RD +b285 lpctl S_RD +b286 qsi S_RD +b287 lsctl S_RD +b28e qctri S_RD +b28f qpaci S_RD +b299 srnm S_RD +b29c stfpc S_RD +b29d lfpc S_RD +b2a5 tre RRE_RR +b2a6 cu21 RRF_U0RR +b2a7 cu12 RRF_U0RR +b2ad nqap RRE_RR +b2ae dqap RRE_RR +b2af pqap RRE_RR +b2b0 stfle S_RD +b2b1 stfl S_RD +b2b2 lpswe S_RD +b2b8 srnmb S_RD +b2b9 srnmt S_RD +b2bd lfas S_RD +b2e0 scctr RRE_RR +b2e1 spctr RRE_RR +b2e4 ecctr RRE_RR +b2e5 epctr RRE_RR +b2e8 ppa RRF_U0RR +b2ec etnd RRE_R0 +b2ed ecpga RRE_RR +b2f0 iucv RRE_RR +b2f8 tend S_00 +b2fa niai IE_UU +b2fc tabort S_RD +b2ff trap4 S_RD +b300 lpebr RRE_FF +b301 lnebr RRE_FF +b302 ltebr RRE_FF +b303 lcebr RRE_FF +b304 ldebr RRE_FF +b305 lxdbr RRE_FF +b306 lxebr RRE_FF +b307 mxdbr RRE_FF +b308 kebr RRE_FF +b309 cebr RRE_FF +b30a aebr RRE_FF +b30b sebr RRE_FF +b30c mdebr RRE_FF +b30d debr RRE_FF +b30e maebr RRF_F0FF +b30f msebr RRF_F0FF +b310 lpdbr RRE_FF +b311 lndbr RRE_FF +b312 ltdbr RRE_FF +b313 lcdbr RRE_FF +b314 sqebr RRE_FF +b315 sqdbr RRE_FF +b316 sqxbr RRE_FF +b317 meebr RRE_FF +b318 kdbr RRE_FF +b319 cdbr RRE_FF +b31a adbr RRE_FF +b31b sdbr RRE_FF +b31c mdbr RRE_FF +b31d ddbr RRE_FF +b31e madbr RRF_F0FF +b31f msdbr RRF_F0FF +b324 lder RRE_FF +b325 lxdr RRE_FF +b326 lxer RRE_FF +b32e maer RRF_F0FF +b32f mser RRF_F0FF +b336 sqxr RRE_FF +b337 meer RRE_FF +b338 maylr RRF_F0FF +b339 mylr RRF_F0FF +b33a mayr RRF_F0FF +b33b myr RRF_F0FF +b33c mayhr RRF_F0FF +b33d myhr RRF_F0FF +b33e madr RRF_F0FF +b33f msdr RRF_F0FF +b340 lpxbr RRE_FF +b341 lnxbr RRE_FF +b342 ltxbr RRE_FF +b343 lcxbr RRE_FF +b344 ledbra RRF_UUFF +b345 ldxbra RRF_UUFF +b346 lexbra RRF_UUFF +b347 fixbra RRF_UUFF +b348 kxbr RRE_FF +b349 cxbr RRE_FF +b34a axbr RRE_FF +b34b sxbr RRE_FF +b34c mxbr RRE_FF +b34d dxbr RRE_FF +b350 tbedr RRF_U0FF +b351 tbdr RRF_U0FF +b353 diebr RRF_FUFF +b357 fiebra RRF_UUFF +b358 thder RRE_FF +b359 thdr RRE_FF +b35b didbr RRF_FUFF +b35f fidbra RRF_UUFF +b360 lpxr RRE_FF +b361 lnxr RRE_FF +b362 ltxr RRE_FF +b363 lcxr RRE_FF +b365 lxr RRE_FF +b366 lexr RRE_FF +b367 fixr RRE_FF +b369 cxr RRE_FF +b370 lpdfr RRE_FF +b371 lndfr RRE_FF +b372 cpsdr RRF_F0FF2 +b373 lcdfr RRE_FF +b374 lzer RRE_F0 +b375 lzdr RRE_F0 +b376 lzxr RRE_F0 +b377 fier RRE_FF +b37f fidr RRE_FF +b384 sfpc RRE_RR +b385 sfasr RRE_R0 +b38c efpc RRE_RR +b390 celfbr RRF_UUFR +b391 cdlfbr RRF_UUFR +b392 cxlfbr RRF_UUFR +b394 cefbra RRF_UUFR +b395 cdfbra RRF_UUFR +b396 cxfbra RRF_UUFR +b398 cfebra RRF_UURF +b399 cfdbra RRF_UURF +b39a cfxbra RRF_UURF +b39c clfebr RRF_UURF +b39d clfdbr RRF_UURF +b39e clfxbr RRF_UURF +b3a0 celgbr RRF_UUFR +b3a1 cdlgbr RRF_UUFR +b3a2 cxlgbr RRF_UUFR +b3a4 cegbra RRF_UUFR +b3a5 cdgbra RRF_UUFR +b3a6 cxgbra RRF_UUFR +b3a8 cgebra RRF_UURF +b3a9 cgdbra RRF_UURF +b3aa cgxbra RRF_UURF +b3ac clgebr RRF_UURF +b3ad clgdbr RRF_UURF +b3ae clgxbr RRF_UURF +b3b4 cefr RRE_FR +b3b5 cdfr RRE_FR +b3b6 cxfr RRE_FR +b3b8 cfer RRF_U0RF +b3b9 cfdr RRF_U0RF +b3ba cfxr RRF_U0RF +b3c1 ldgr RRE_FR +b3c4 cegr RRE_FR +b3c5 cdgr RRE_FR +b3c6 cxgr RRE_FR +b3c8 cger RRF_U0RF +b3c9 cgdr RRF_U0RF +b3ca cgxr RRF_U0RF +b3cd lgdr RRE_RF +b3d0 mdtra RRF_FUFF2 +b3d1 ddtra RRF_FUFF2 +b3d2 adtra RRF_FUFF2 +b3d3 sdtra RRF_FUFF2 +b3d4 ldetr RRF_0UFF +b3d5 ledtr RRF_UUFF +b3d6 ltdtr RRE_FF +b3d7 fidtr RRF_UUFF +b3d8 mxtra RRF_FUFF2 +b3d9 dxtra RRF_FUFF2 +b3da axtra RRF_FUFF2 +b3db sxtra RRF_FUFF2 +b3dc lxdtr RRF_0UFF +b3dd ldxtr RRF_UUFF +b3de ltxtr RRE_FF +b3df fixtr RRF_UUFF +b3e0 kdtr RRE_FF +b3e1 cgdtra RRF_UURF +b3e2 cudtr RRE_RF +b3e3 csdtr RRF_0URF +b3e4 cdtr RRE_FF +b3e5 eedtr RRE_RF +b3e7 esdtr RRE_RF +b3e8 kxtr RRE_FF +b3e9 cgxtra RRF_UURF +b3ea cuxtr RRE_RF +b3eb csxtr RRF_0URF +b3ec cxtr RRE_FF +b3ed eextr RRE_RF +b3ef esxtr RRE_RF +b3f1 cdgtra RRF_UUFR +b3f2 cdutr RRE_FR +b3f3 cdstr RRE_FR +b3f4 cedtr RRE_FF +b3f5 qadtr RRF_FUFF +b3f6 iedtr RRF_F0FR +b3f7 rrdtr RRF_FFRU +b3f9 cxgtra RRF_UUFR +b3fa cxutr RRE_FR +b3fb cxstr RRE_FR +b3fc cextr RRE_FF +b3fd qaxtr RRF_FUFF +b3fe iextr RRF_F0FR +b3ff rrxtr RRF_FFRU +b6 stctl RS_CCRD +b7 lctl RS_CCRD +b900 lpgr RRE_RR +b901 lngr RRE_RR +b902 ltgr RRE_RR +b903 lcgr RRE_RR +b904 lgr RRE_RR +b905 lurag RRE_RR +b906 lgbr RRE_RR +b907 lghr RRE_RR +b908 agr RRE_RR +b909 sgr RRE_RR +b90a algr RRE_RR +b90b slgr RRE_RR +b90c msgr RRE_RR +b90d dsgr RRE_RR +b90e eregg RRE_RR +b90f lrvgr RRE_RR +b910 lpgfr RRE_RR +b911 lngfr RRE_RR +b912 ltgfr RRE_RR +b913 lcgfr RRE_RR +b914 lgfr RRE_RR +b916 llgfr RRE_RR +b917 llgtr RRE_RR +b918 agfr RRE_RR +b919 sgfr RRE_RR +b91a algfr RRE_RR +b91b slgfr RRE_RR +b91c msgfr RRE_RR +b91d dsgfr RRE_RR +b91e kmac RRE_RR +b91f lrvr RRE_RR +b920 cgr RRE_RR +b921 clgr RRE_RR +b925 sturg RRE_RR +b926 lbr RRE_RR +b927 lhr RRE_RR +b928 pckmo RRE_00 +b929 kma RRF_R0RR +b92a kmf RRE_RR +b92b kmo RRE_RR +b92c pcc RRE_00 +b92d kmctr RRF_R0RR +b92e km RRE_RR +b92f kmc RRE_RR +b930 cgfr RRE_RR +b931 clgfr RRE_RR +b938 sortl RRE_RR +b939 dfltcc RRF_R0RR2 +b93a kdsa RRE_RR +b93b nnpa RRE_00 +b93c ppno RRE_RR +b93e kimd RRE_RR +b93f klmd RRE_RR +b941 cfdtr RRF_UURF +b942 clgdtr RRF_UURF +b943 clfdtr RRF_UURF +b946 bctgr RRE_RR +b949 cfxtr RRF_UURF +b94a clgxtr RRF_UURF +b94b clfxtr RRF_UURF +b951 cdftr RRF_UUFR +b952 cdlgtr RRF_UUFR +b953 cdlftr RRF_UUFR +b959 cxftr RRF_UUFR +b95a cxlgtr RRF_UUFR +b95b cxlftr RRF_UUFR +b960 cgrt RRF_U0RR +b961 clgrt RRF_U0RR +b964 nngrk RRF_R0RR2 +b965 ocgrk RRF_R0RR2 +b966 nogrk RRF_R0RR2 +b967 nxgrk RRF_R0RR2 +b972 crt RRF_U0RR +b973 clrt RRF_U0RR +b974 nnrk RRF_R0RR2 +b975 ocrk RRF_R0RR2 +b976 nork RRF_R0RR2 +b977 nxrk RRF_R0RR2 +b980 ngr RRE_RR +b981 ogr RRE_RR +b982 xgr RRE_RR +b983 flogr RRE_RR +b984 llgcr RRE_RR +b985 llghr RRE_RR +b986 mlgr RRE_RR +b987 dlgr RRE_RR +b988 alcgr RRE_RR +b989 slbgr RRE_RR +b98a cspg RRE_RR +b98b rdp RRF_RURR2 +b98d epsw RRE_RR +b98e idte RRF_RURR2 +b98f crdte RRF_RURR2 +b990 trtt RRF_U0RR +b991 trto RRF_U0RR +b992 trot RRF_U0RR +b993 troo RRF_U0RR +b994 llcr RRE_RR +b995 llhr RRE_RR +b996 mlr RRE_RR +b997 dlr RRE_RR +b998 alcr RRE_RR +b999 slbr RRE_RR +b99a epair RRE_R0 +b99b esair RRE_R0 +b99c eqbs RRF_U0RR +b99d esea RRE_R0 +b99e pti RRE_RR +b99f ssair RRE_R0 +b9a0 clp RRF_U0RR +b9a1 tpei RRE_RR +b9a2 ptf RRE_R0 +b9a4 uvc RRF_URR +b9aa lptea RRF_RURR2 +b9ab essa RRF_U0RR +b9ac irbm RRE_RR +b9ae rrbm RRE_RR +b9af pfmf RRE_RR +b9b0 cu14 RRF_U0RR +b9b1 cu24 RRF_U0RR +b9b2 cu41 RRE_RR +b9b3 cu42 RRE_RR +b9bd trtre RRF_U0RR +b9be srstu RRE_RR +b9bf trte RRF_U0RR +b9c0 selfhr RRF_RURR +b9c8 ahhhr RRF_R0RR2 +b9c9 shhhr RRF_R0RR2 +b9ca alhhhr RRF_R0RR2 +b9cb slhhhr RRF_R0RR2 +b9cd chhr RRE_RR +b9cf clhhr RRE_RR +b9d0 pcistg RRE_RR +b9d2 pcilg RRE_RR +b9d3 rpcit RRE_RR +b9d4 pcistgi RRE_RR +b9d5 pciwb RRE_00 +b9d6 pcilgi RRE_RR +b9d8 ahhlr RRF_R0RR2 +b9d9 shhlr RRF_R0RR2 +b9da alhhlr RRF_R0RR2 +b9db slhhlr RRF_R0RR2 +b9dd chlr RRE_RR +b9df clhlr RRE_RR +b9e0 locfhr RRF_U0RR +b9e1 popcnt RRF_U0RR +b9e2 locgr RRF_U0RR +b9e3 selgr RRF_RURR +b9e4 ngrk RRF_R0RR2 +b9e5 ncgrk RRF_R0RR2 +b9e6 ogrk RRF_R0RR2 +b9e7 xgrk RRF_R0RR2 +b9e8 agrk RRF_R0RR2 +b9e9 sgrk RRF_R0RR2 +b9ea algrk RRF_R0RR2 +b9eb slgrk RRF_R0RR2 +b9ec mgrk RRF_R0RR2 +b9ed msgrkc RRF_R0RR2 +b9f0 selr RRF_RURR +b9f2 locr RRF_U0RR +b9f4 nrk RRF_R0RR2 +b9f5 ncrk RRF_R0RR2 +b9f6 ork RRF_R0RR2 +b9f7 xrk RRF_R0RR2 +b9f8 ark RRF_R0RR2 +b9f9 srk RRF_R0RR2 +b9fa alrk RRF_R0RR2 +b9fb slrk RRF_R0RR2 +b9fd msrkc RRF_R0RR2 +ba cs RS_RRRD +bb cds RS_RRRD +bd clm RS_RURD +be stcm RS_RURD +bf icm RS_RURD +c00 larl RIL_RP +c01 lgfi RIL_RI +c04 brcl RIL_UP +c05 brasl RIL_RP +c06 xihf RIL_RU +c07 xilf RIL_RU +c08 iihf RIL_RU +c09 iilf RIL_RU +c0a nihf RIL_RU +c0b nilf RIL_RU +c0c oihf RIL_RU +c0d oilf RIL_RU +c0e llihf RIL_RU +c0f llilf RIL_RU +c20 msgfi RIL_RI +c21 msfi RIL_RI +c24 slgfi RIL_RU +c25 slfi RIL_RU +c28 agfi RIL_RI +c29 afi RIL_RI +c2a algfi RIL_RU +c2b alfi RIL_RU +c2c cgfi RIL_RI +c2d cfi RIL_RI +c2e clgfi RIL_RU +c2f clfi RIL_RU +c42 llhrl RIL_RP +c44 lghrl RIL_RP +c45 lhrl RIL_RP +c46 llghrl RIL_RP +c47 sthrl RIL_RP +c48 lgrl RIL_RP +c4b stgrl RIL_RP +c4c lgfrl RIL_RP +c4d lrl RIL_RP +c4e llgfrl RIL_RP +c4f strl RIL_RP +c5 bprp MII_UPP +c60 exrl RIL_RP +c62 pfdrl RIL_UP +c64 cghrl RIL_RP +c65 chrl RIL_RP +c66 clghrl RIL_RP +c67 clhrl RIL_RP +c68 cgrl RIL_RP +c6a clgrl RIL_RP +c6c cgfrl RIL_RP +c6d crl RIL_RP +c6e clgfrl RIL_RP +c6f clrl RIL_RP +c7 bpp SMI_U0RDP +c80 mvcos SSF_RRDRD +c81 ectg SSF_RRDRD +c82 csst SSF_RRDRD +c84 lpd SSF_RRDRD2 +c85 lpdg SSF_RRDRD2 +cc6 brcth RIL_RP +cc8 aih RIL_RI +cca alsih RIL_RI +ccb alsihn RIL_RI +ccd cih RIL_RI +ccf clih RIL_RU +d0 trtr SS_L0RDRD +d1 mvn SS_L0RDRD +d2 mvc SS_L0RDRD +d3 mvz SS_L0RDRD +d4 nc SS_L0RDRD +d5 clc SS_L0RDRD +d6 oc SS_L0RDRD +d7 xc SS_L0RDRD +d9 mvck SS_RRRDRD +da mvcp SS_RRRDRD +db mvcs SS_RRRDRD +dc tr SS_L0RDRD +dd trt SS_L0RDRD +de ed SS_L0RDRD +df edmk SS_L0RDRD +e1 pku SS_L2RDRD +e2 unpku SS_L0RDRD +e302 ltg RXY_RRRD +e303 lrag RXY_RRRD +e304 lg RXY_RRRD +e306 cvby RXY_RRRD +e308 ag RXY_RRRD +e309 sg RXY_RRRD +e30a alg RXY_RRRD +e30b slg RXY_RRRD +e30c msg RXY_RRRD +e30d dsg RXY_RRRD +e30e cvbg RXY_RRRD +e30f lrvg RXY_RRRD +e312 lt RXY_RRRD +e313 lray RXY_RRRD +e314 lgf RXY_RRRD +e315 lgh RXY_RRRD +e316 llgf RXY_RRRD +e317 llgt RXY_RRRD +e318 agf RXY_RRRD +e319 sgf RXY_RRRD +e31a algf RXY_RRRD +e31b slgf RXY_RRRD +e31c msgf RXY_RRRD +e31d dsgf RXY_RRRD +e31e lrv RXY_RRRD +e31f lrvh RXY_RRRD +e320 cg RXY_RRRD +e321 clg RXY_RRRD +e324 stg RXY_RRRD +e325 ntstg RXY_RRRD +e326 cvdy RXY_RRRD +e32a lzrg RXY_RRRD +e32e cvdg RXY_RRRD +e32f strvg RXY_RRRD +e330 cgf RXY_RRRD +e331 clgf RXY_RRRD +e332 ltgf RXY_RRRD +e334 cgh RXY_RRRD +e336 pfd RXY_URRD +e338 agh RXY_RRRD +e339 sgh RXY_RRRD +e33a llzrgf RXY_RRRD +e33b lzrf RXY_RRRD +e33c mgh RXY_RRRD +e33e strv RXY_RRRD +e33f strvh RXY_RRRD +e346 bctg RXY_RRRD +e347 bic RXY_URRD +e348 llgfsg RXY_RRRD +e349 stgsc RXY_RRRD +e34c lgg RXY_RRRD +e34d lgsc RXY_RRRD +e350 sty RXY_RRRD +e351 msy RXY_RRRD +e353 msc RXY_RRRD +e354 ny RXY_RRRD +e355 cly RXY_RRRD +e356 oy RXY_RRRD +e357 xy RXY_RRRD +e358 ly RXY_RRRD +e359 cy RXY_RRRD +e35a ay RXY_RRRD +e35b sy RXY_RRRD +e35c mfy RXY_RRRD +e35e aly RXY_RRRD +e35f sly RXY_RRRD +e370 sthy RXY_RRRD +e371 lay RXY_RRRD +e372 stcy RXY_RRRD +e373 icy RXY_RRRD +e375 laey RXY_RRRD +e376 lb RXY_RRRD +e377 lgb RXY_RRRD +e378 lhy RXY_RRRD +e379 chy RXY_RRRD +e37a ahy RXY_RRRD +e37b shy RXY_RRRD +e37c mhy RXY_RRRD +e380 ng RXY_RRRD +e381 og RXY_RRRD +e382 xg RXY_RRRD +e383 msgc RXY_RRRD +e384 mg RXY_RRRD +e385 lgat RXY_RRRD +e386 mlg RXY_RRRD +e387 dlg RXY_RRRD +e388 alcg RXY_RRRD +e389 slbg RXY_RRRD +e38e stpq RXY_RRRD +e38f lpq RXY_RRRD +e390 llgc RXY_RRRD +e391 llgh RXY_RRRD +e394 llc RXY_RRRD +e395 llh RXY_RRRD +e396 ml RXY_RRRD +e397 dl RXY_RRRD +e398 alc RXY_RRRD +e399 slb RXY_RRRD +e39c llgtat RXY_RRRD +e39d llgfat RXY_RRRD +e39f lat RXY_RRRD +e3c0 lbh RXY_RRRD +e3c2 llch RXY_RRRD +e3c3 stch RXY_RRRD +e3c4 lhh RXY_RRRD +e3c6 llhh RXY_RRRD +e3c7 sthh RXY_RRRD +e3c8 lfhat RXY_RRRD +e3ca lfh RXY_RRRD +e3cb stfh RXY_RRRD +e3cd chf RXY_RRRD +e3cf clhf RXY_RRRD +e3d0 mpcifc RXY_RRRD +e3d4 stpcifc RXY_RRRD +e500 lasp SSE_RDRD +e501 tprot SSE_RDRD +e502 strag SSE_RDRD +e50a mvcrl SSE_RDRD +e50e mvcsk SSE_RDRD +e50f mvcdk SSE_RDRD +e544 mvhhi SIL_RDI +e548 mvghi SIL_RDI +e54c mvhi SIL_RDI +e554 chhsi SIL_RDI +e555 clhhsi SIL_RDU +e558 cghsi SIL_RDI +e559 clghsi SIL_RDU +e55c chsi SIL_RDI +e55d clfhsi SIL_RDU +e560 tbegin SIL_RDU +e561 tbeginc SIL_RDU +e601 vlebrh VRX_VRRDU +e602 vlebrg VRX_VRRDU +e603 vlebrf VRX_VRRDU +e604 vllebrz VRX_VRRDU +e605 vlbrrep VRX_VRRDU +e606 vlbr VRX_VRRDU +e607 vler VRX_VRRDU +e609 vstebrh VRX_VRRDU +e60a vstebrg VRX_VRRDU +e60b vstebrf VRX_VRRDU +e60e vstbr VRX_VRRDU +e60f vster VRX_VRRDU +e634 vpkz VSI_URDV +e635 vlrl VSI_URDV +e637 vlrlr VRS_RRDV +e63c vupkz VSI_URDV +e63d vstrl VSI_URDV +e63f vstrlr VRS_RRDV +e649 vlip VRI_V0UU2 +e650 vcvb VRR_RV0UU +e651 vclzdp VRR_VV0U2 +e652 vcvbg VRR_RV0UU +e654 vupkzh VRR_VV0U2 +e655 vcnf VRR_VV0UU2 +e656 vclfnh VRR_VV0UU2 +e658 vcvd VRI_VR0UU +e659 vsrp VRI_VVUUU2 +e65a vcvdg VRI_VR0UU +e65b vpsop VRI_VVUUU2 +e65c vupkzl VRR_VV0U2 +e65d vcfn VRR_VV0UU2 +e65e vclfnl VRR_VV0UU2 +e65f vtp VRR_0V +e670 vpkzr VRI_VVV0UU2 +e671 vap VRI_VVV0UU2 +e672 vsrpr VRI_VVV0UU2 +e673 vsp VRI_VVV0UU2 +e674 vschp VRR_VVV0U0U +e675 vcrnf VRR_VVV0UU +e677 vcp VRR_0VV0U +e678 vmp VRI_VVV0UU2 +e679 vmsp VRI_VVV0UU2 +e67a vdp VRI_VVV0UU2 +e67b vrp VRI_VVV0UU2 +e67c vscshp VRR_VVV +e67d vcsph VRR_VVV0U0 +e67e vsdp VRI_VVV0UU2 +e700 vleb VRX_VRRDU +e701 vleh VRX_VRRDU +e702 vleg VRX_VRRDU +e703 vlef VRX_VRRDU +e704 vllez VRX_VRRDU +e705 vlrep VRX_VRRDU +e706 vl VRX_VRRDU +e707 vlbb VRX_VRRDU +e708 vsteb VRX_VRRDU +e709 vsteh VRX_VRRDU +e70a vsteg VRX_VRRDU +e70b vstef VRX_VRRDU +e70e vst VRX_VRRDU +e712 vgeg VRV_VVXRDU +e713 vgef VRV_VVXRDU +e71a vsceg VRV_VVXRDU +e71b vscef VRV_VVXRDU +e721 vlgv VRS_RVRDU +e722 vlvg VRS_VRRDU +e727 lcbb RXE_RRRDU +e730 vesl VRS_VVRDU +e733 verll VRS_VVRDU +e736 vlm VRS_VVRDU +e737 vll VRS_VRRD +e738 vesrl VRS_VVRDU +e73a vesra VRS_VVRDU +e73e vstm VRS_VVRDU +e73f vstl VRS_VRRD +e740 vleib VRI_V0IU +e741 vleih VRI_V0IU +e742 vleig VRI_V0IU +e743 vleif VRI_V0IU +e744 vgbm VRI_V0U +e745 vrepi VRI_V0IU +e746 vgm VRI_V0UUU +e74a vftci VRI_VVUUU +e74d vrep VRI_VVUU +e750 vpopct VRR_VV0U +e752 vctz VRR_VV0U +e753 vclz VRR_VV0U +e756 vlr VRX_VV +e75c vistr VRR_VV0U0U +e75f vseg VRR_VV0U +e760 vmrl VRR_VVV0U +e761 vmrh VRR_VVV0U +e762 vlvgp VRR_VRR +e764 vsum VRR_VVV0U +e765 vsumg VRR_VVV0U +e766 vcksm VRR_VVV +e767 vsumq VRR_VVV0U +e768 vn VRR_VVV +e769 vnc VRR_VVV +e76a vo VRR_VVV +e76b vno VRR_VVV +e76c vnx VRR_VVV +e76d vx VRR_VVV +e76e vnn VRR_VVV +e76f voc VRR_VVV +e770 veslv VRR_VVV0U +e772 verim VRI_VVV0UU +e773 verllv VRR_VVV0U +e774 vsl VRR_VVV +e775 vslb VRR_VVV +e777 vsldb VRI_VVV0U +e778 vesrlv VRR_VVV0U +e77a vesrav VRR_VVV0U +e77c vsrl VRR_VVV +e77d vsrlb VRR_VVV +e77e vsra VRR_VVV +e77f vsrab VRR_VVV +e780 vfee VRR_VVV0U0U +e781 vfene VRR_VVV0U0U +e782 vfae VRR_VVV0U0U +e784 vpdi VRR_VVV0U +e785 vbperm VRR_VVV +e786 vsld VRI_VVV0U +e787 vsrd VRI_VVV0U +e78a vstrc VRR_VVVUU0V +e78b vstrs VRR_VVVUU0V +e78c vperm VRR_VVV0V +e78d vsel VRR_VVV0V +e78e vfms VRR_VVVU0UV +e78f vfma VRR_VVVU0UV +e794 vpk VRR_VVV0U +e795 vpkls VRR_VVV0U0U +e797 vpks VRR_VVV0U0U +e79e vfnms VRR_VVVU0UV +e79f vfnma VRR_VVVU0UV +e7a1 vmlh VRR_VVV0U +e7a2 vml VRR_VVV0U +e7a3 vmh VRR_VVV0U +e7a4 vmle VRR_VVV0U +e7a5 vmlo VRR_VVV0U +e7a6 vme VRR_VVV0U +e7a7 vmo VRR_VVV0U +e7a9 vmalh VRR_VVVU0V +e7aa vmal VRR_VVVU0V +e7ab vmah VRR_VVVU0V +e7ac vmale VRR_VVVU0V +e7ad vmalo VRR_VVVU0V +e7ae vmae VRR_VVVU0V +e7af vmao VRR_VVVU0V +e7b4 vgfm VRR_VVV0U +e7b8 vmsl VRR_VVVUU0V +e7b9 vaccc VRR_VVVU0V +e7bb vac VRR_VVVU0V +e7bc vgfma VRR_VVVU0V +e7bd vsbcbi VRR_VVVU0V +e7bf vsbi VRR_VVVU0V +e7c0 vclgd VRR_VV0UUU +e7c1 vcdlg VRR_VV0UUU +e7c2 vcgd VRR_VV0UUU +e7c3 vcdg VRR_VV0UUU +e7c4 vlde VRR_VV0UU2 +e7c5 vled VRR_VV0UUU +e7c7 vfi VRR_VV0UUU +e7ca wfk VRR_VV0UU2 +e7cb wfc VRR_VV0UU2 +e7cc vfpso VRR_VV0UUU +e7ce vfsq VRR_VV0UU2 +e7d4 vupll VRR_VV0U +e7d5 vuplh VRR_VV0U +e7d6 vupl VRR_VV0U +e7d7 vuph VRR_VV0U +e7d8 vtm VRR_VV +e7d9 vecl VRR_VV0U +e7db vec VRR_VV0U +e7de vlc VRR_VV0U +e7df vlp VRR_VV0U +e7e2 vfs VRR_VVV0UU +e7e3 vfa VRR_VVV0UU +e7e5 vfd VRR_VVV0UU +e7e7 vfm VRR_VVV0UU +e7e8 vfce VRR_VVV0UUU +e7ea vfche VRR_VVV0UUU +e7eb vfch VRR_VVV0UUU +e7ee vfmin VRR_VVV0UUU +e7ef vfmax VRR_VVV0UUU +e7f0 vavgl VRR_VVV0U +e7f1 vacc VRR_VVV0U +e7f2 vavg VRR_VVV0U +e7f3 va VRR_VVV0U +e7f5 vscbi VRR_VVV0U +e7f7 vs VRR_VVV0U +e7f8 vceq VRR_VVV0U0U +e7f9 vchl VRR_VVV0U0U +e7fb vch VRR_VVV0U0U +e7fc vmnl VRR_VVV0U +e7fd vmxl VRR_VVV0U +e7fe vmn VRR_VVV0U +e7ff vmx VRR_VVV0U +e8 mvcin SS_L0RDRD +e9 pka SS_L2RDRD +ea unpka SS_L0RDRD +eb04 lmg RSY_RRRD +eb0a srag RSY_RRRD +eb0b slag RSY_RRRD +eb0c srlg RSY_RRRD +eb0d sllg RSY_RRRD +eb0f tracg RSY_RRRD +eb14 csy RSY_RRRD +eb17 stcctm RSY_RURD +eb1c rllg RSY_RRRD +eb1d rll RSY_RRRD +eb20 clmh RSY_RURD +eb21 clmy RSY_RURD +eb23 clt RSY_RURD +eb24 stmg RSY_RRRD +eb25 stctg RSY_CCRD +eb26 stmh RSY_RRRD +eb2b clgt RSY_RURD +eb2c stcmh RSY_RURD +eb2d stcmy RSY_RURD +eb2f lctlg RSY_CCRD +eb30 csg RSY_RRRD +eb31 cdsy RSY_RRRD +eb3e cdsg RSY_RRRD +eb44 bxhg RSY_RRRD +eb45 bxleg RSY_RRRD +eb4c ecag RSY_RRRD +eb51 tmy SIY_URD +eb52 mviy SIY_URD +eb54 niy SIY_URD +eb55 cliy SIY_URD +eb56 oiy SIY_URD +eb57 xiy SIY_URD +eb60 lric RSY_RDRU +eb61 stric RSY_RDRU +eb62 mric RSY_RDRU +eb6a asi SIY_IRD +eb6e alsi SIY_IRD +eb71 lpswey SIY_RD +eb7a agsi SIY_IRD +eb7e algsi SIY_IRD +eb80 icmh RSY_RURD +eb81 icmy RSY_RURD +eb8a sqbs RSY_RDRU +eb8e mvclu RSY_RRRD +eb8f clclu RSY_RRRD +eb90 stmy RSY_RRRD +eb96 lmh RSY_RRRD +eb98 lmy RSY_RRRD +eb9a lamy RSY_AARD +eb9b stamy RSY_AARD +ebc0 tp RSL_R0RD +ebd0 pcistb RSY_RRRD +ebd1 sic RSY_RRRD +ebd4 pcistbi RSY_RRRD +ebdc srak RSY_RRRD +ebdd slak RSY_RRRD +ebde srlk RSY_RRRD +ebdf sllk RSY_RRRD +ebe0 locfh RSY_RURD2 +ebe1 stocfh RSY_RURD2 +ebe2 locg RSY_RURD2 +ebe3 stocg RSY_RURD2 +ebe4 lang RSY_RRRD +ebe6 laog RSY_RRRD +ebe7 laxg RSY_RRRD +ebe8 laag RSY_RRRD +ebea laalg RSY_RRRD +ebf2 loc RSY_RURD2 +ebf3 stoc RSY_RURD2 +ebf4 lan RSY_RRRD +ebf6 lao RSY_RRRD +ebf7 lax RSY_RRRD +ebf8 laa RSY_RRRD +ebfa laal RSY_RRRD +ec42 lochi RIE_RUI0 +ec44 brxhg RIE_RRP +ec45 brxlg RIE_RRP +ec46 locghi RIE_RUI0 +ec4e lochhi RIE_RUI0 +ec51 risblg RIE_RRUUU +ec54 rnsbg RIE_RRUUU +ec55 risbg RIE_RRUUU +ec56 rosbg RIE_RRUUU +ec57 rxsbg RIE_RRUUU +ec59 risbgn RIE_RRUUU +ec5d risbhg RIE_RRUUU +ec64 cgrj RIE_RRPU +ec65 clgrj RIE_RRPU +ec70 cgit RIE_R0IU +ec71 clgit RIE_R0UU +ec72 cit RIE_R0IU +ec73 clfit RIE_R0UU +ec76 crj RIE_RRPU +ec77 clrj RIE_RRPU +ec7c cgij RIE_RUPI +ec7d clgij RIE_RUPU +ec7e cij RIE_RUPI +ec7f clij RIE_RUPU +ecd8 ahik RIE_RRI0 +ecd9 aghik RIE_RRI0 +ecda alhsik RIE_RRI0 +ecdb alghsik RIE_RRI0 +ece4 cgrb RRS_RRRDU +ece5 clgrb RRS_RRRDU +ecf6 crb RRS_RRRDU +ecf7 clrb RRS_RRRDU +ecfc cgib RIS_RURDI +ecfd clgib RIS_RURDU +ecfe cib RIS_RURDI +ecff clib RIS_RURDU +ed04 ldeb RXE_FRRD +ed05 lxdb RXE_FRRD +ed06 lxeb RXE_FRRD +ed07 mxdb RXE_FRRD +ed08 keb RXE_FRRD +ed09 ceb RXE_FRRD +ed0a aeb RXE_FRRD +ed0b seb RXE_FRRD +ed0c mdeb RXE_FRRD +ed0d deb RXE_FRRD +ed0e maeb RXF_FRRDF +ed0f mseb RXF_FRRDF +ed10 tceb RXE_FRRD +ed11 tcdb RXE_FRRD +ed12 tcxb RXE_FRRD +ed14 sqeb RXE_FRRD +ed15 sqdb RXE_FRRD +ed17 meeb RXE_FRRD +ed18 kdb RXE_FRRD +ed19 cdb RXE_FRRD +ed1a adb RXE_FRRD +ed1b sdb RXE_FRRD +ed1c mdb RXE_FRRD +ed1d ddb RXE_FRRD +ed1e madb RXF_FRRDF +ed1f msdb RXF_FRRDF +ed24 lde RXE_FRRD +ed25 lxd RXE_FRRD +ed26 lxe RXE_FRRD +ed2e mae RXF_FRRDF +ed2f mse RXF_FRRDF +ed34 sqe RXE_FRRD +ed35 sqd RXE_FRRD +ed37 mee RXE_FRRD +ed38 mayl RXF_FRRDF +ed39 myl RXF_FRRDF +ed3a may RXF_FRRDF +ed3b my RXF_FRRDF +ed3c mayh RXF_FRRDF +ed3d myh RXF_FRRDF +ed3e mad RXF_FRRDF +ed3f msd RXF_FRRDF +ed40 sldt RXF_FRRDF +ed41 srdt RXF_FRRDF +ed48 slxt RXF_FRRDF +ed49 srxt RXF_FRRDF +ed50 tdcet RXE_FRRD +ed51 tdget RXE_FRRD +ed54 tdcdt RXE_FRRD +ed55 tdgdt RXE_FRRD +ed58 tdcxt RXE_FRRD +ed59 tdgxt RXE_FRRD +ed64 ley RXY_FRRD +ed65 ldy RXY_FRRD +ed66 stey RXY_FRRD +ed67 stdy RXY_FRRD +eda8 czdt RSL_LRDFU +eda9 czxt RSL_LRDFU +edaa cdzt RSL_LRDFU +edab cxzt RSL_LRDFU +edac cpdt RSL_LRDFU +edad cpxt RSL_LRDFU +edae cdpt RSL_LRDFU +edaf cxpt RSL_LRDFU +ee plo SS_RRRDRD2 +ef lmd SS_RRRDRD3 +f0 srp SS_LIRDRD +f1 mvo SS_LLRDRD +f2 pack SS_LLRDRD +f3 unpk SS_LLRDRD +f8 zap SS_LLRDRD +f9 cp SS_LLRDRD +fa ap SS_LLRDRD +fb sp SS_LLRDRD +fc mp SS_LLRDRD +fd dp SS_LLRDRD -- cgit v1.2.3